Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37

- api.py: strip CV pinning workaround (parity violation, see architecture doc) - dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x and filters in-process. Was silently dropping the parameter; would have confounded E3 with broken cross-stage exclusion in Graphiti arm. - watcher.py + ingest.py: F14 — drop full_text[:50000] truncation. Was propagating through entire cascade. Postgres TEXT can hold up to 1GB. - corpus_integrity.py: F37 — same truncation, third path now clean. Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.* timestamped pre-fix. Re-cascaded Shop Class as Soulcraft (only already-cascaded source affected by F14, 414KB).
2026-05-01 02:26:37 +00:00
parent 25e42c0231
commit 465f2f725b
17 changed files with 4432 additions and 58 deletions
@@ -0,0 +1,161 @@
 {
  "timestamp": "2026-04-30T21:58:06.498354",
  "summary": {
    "filesystem_total": 1399,
    "pgvector_total": 1215,
    "graphiti_total": 1205,
    "both": 998,
    "pgvector_only": 10,
    "neither": 129,
    "graphiti_only": 0,
    "failures": 0,
    "orphans_pgvector": 207,
    "orphans_graphiti": 207
  },
  "gaps": [
    "test.md",
    "test-watcher-trigger.md",
    "Renders.pptx",
    "Ribbon Cutting Slideshow.pptx",
    "PRINTERS.docx",
    "Course Calender.docx",
    "Attendance.docx",
    "GH Slicer Notes [Autosaved].pptx",
    "GH Slicer Notes.pptx",
    "06_Surface Commands.docx",
    "03_Precision_1 Homework.docx",
    "02_2D Geometry Toolbars.docx",
    "00_Roster.docx",
    "Homeworks.docx",
    "02_Osnap_ModelingAids.docx",
    "06_Solids.docx",
    "04_Precision_2 Homework.docx",
    "~\ufffdMod08_Post_Processing_2023.pptx",
    "~\ufffdMod02_Industries_and_Applications_2023.pptx",
    "~\ufffdMod04_FDM_Materials_2023.pptx",
    "Byron_V Independent Study.pdf",
    "IanC.Lorber_Final_2dsketches..pdf",
    "Irene_Raptopoulos_Final.pdf",
    "rcolon-final-schematic.pdf",
    "kellogg_schematic_schem.pdf",
    "Visa Soccer Goal Frame Drawing.pdf",
    "4_13_2017_0_22_14.pdf",
    "4_13_2017_0_50_29.pdf",
    "4_13_2017_0_49_25.pdf",
    "4_13_2017_12_27_19.pdf",
    "4_13_2017_12_26_54.pdf",
    "4_12_2017_21_7_14.pdf",
    "4_13_2017_12_25_49.pdf",
    "4_13_2017_12_26_5.pdf",
    "4_13_2017_12_27_8.pdf",
    "IanC.Lorber_Final_2dsketches  - 2016.pdf",
    "MidtermLayout1.pdf",
    "MidtermLayout2.pdf",
    "pascar_finalSchematic.pdf",
    "Week 7.pdf",
    "EXAMPLE Art Education IM Rubric and Scaffold UPDATED copy.docx",
    "DDF - IM Rubric and Scaffold UPDATED.docx",
    "DI_9.pdf",
    "DI_6.pdf",
    "DI_4.pdf",
    "DI_7.pdf",
    "DI_8.pdf",
    "DI_2.pdf",
    "DI_1.pdf",
    "DI_3.pdf",
    "DI_5.pdf",
    "JN EVAL.pdf",
    "Amazon.com_ Iwata Eclipse HP-CS Airbrush - Gravity Feed Dual Action, High-Flow Atomization for Fine Detail to Wide Coverage \u2013 E3 Nozzle, 0.pdf",
    "RCPA1.pdf",
    "image2022-01-07-133846 - CAryn.pdf",
    "image2022-01-07-134536 - Conference.pdf",
    "image2022-01-07-131439.pdf",
    "image2022-01-07-135248 - BSC Revision.pdf",
    "image2022-01-07-141250- LAI.pdf",
    "image2022-01-07-134938 - Maker in Residence.pdf",
    "image2022-01-07-133157 - Teching Evals.pdf",
    "image2022-01-07-135538 PRS Medal.pdf",
    "image2022-01-07-131217.pdf",
    "image2022-01-07-135911 - HVAMC.pdf",
    "image2022-01-07-133504 - Sarah and OLiva.pdf",
    "image2022-01-07-131903 - Annual Reports.pdf",
    "image2022-01-07-132917 - Mastery.pdf",
    "Annual Report - 2016.pdf",
    "Annual Report - 2018.pdf",
    "Annual Report - 2019.pdf",
    "Annual Report - 2017.pdf",
    "MOU 2018 - 19.pdf",
    "Appointment 2016.pdf",
    "Appointment 2019 - 2021.pdf",
    "MOU 2017.pdf",
    "SKMBT_55220060909245.pdf",
    "Dean SSE Letter 2018.pdf",
    "Dean FPA Letter 2018.pdf",
    "Sarah Thesis Article.pdf",
    "Caryn Bylott Thesis.pdf",
    "Olivia Thesis.pdf",
    "Teching Evals.pdf",
    "Maker in Residence - Dalles v2.pdf",
    "Modela Gallery - Ripples from Stillwater.pdf",
    "Maker in Residence - Dalles.pdf",
    "Makerbot Innovation Center Director.pdf",
    "Design Week 2019.pdf",
    "DW Workshops.pdf",
    "Ron Rael Lecture.pdf",
    "BSC Revision.pdf",
    "President's Medal.pdf",
    "CNC Workshop.pdf",
    "Dorksy.pdf",
    "FULL SCAN.pdf",
    "DDF Course Fee Rev 2017.pdf",
    "SlideSlam 2019.pdf",
    "Candy and Cold Cases.pdf",
    "Murder ID.pdf",
    "HVAMC SuperLab.pdf",
    "Certifications.pdf",
    "Brazkem Letter.pdf",
    "Havana 2017.pdf",
    "CAA Conference - 2020.pdf",
    "CAA Conference 2019.pdf",
    "Central Hudson Grant.pdf",
    "3D Printing Updates - Council of Industry.pdf",
    "k12 Training.pdf",
    "Hudson Valley Futures Summit.pdf",
    "StateAssemblymanTour.pdf",
    "Best of New Paltz.pdf",
    "HVAMC - ColorPage.pdf",
    "HVAMC - Daily Freeman.pdf",
    "Chocolate Skulls.pdf",
    "10_13_2016_21_19_43.pdf",
    "10_13_2016_21_19_46.pdf",
    "10_13_2016_21_19_1.pdf",
    "10_13_2016_21_18_55.pdf",
    "161012_102427.pdf",
    "161012_102737.pdf",
    "161012_114217.pdf",
    "Eto Forms.txt",
    "AaronNelsonUndergraduteTranscript Unsecured.pdf",
    "Aaron Nelson Transcript Undergradute Unsecured.pdf",
    "AP023.pdf",
    "How Buildings Learn_ What Happens After They are Built -- Stewart Brand .pdf",
    "Occupying and connecting _ thoughts on territories and -- Frei Otto; Berthold Burkhardt.pdf",
    "SilkwormManual.pdf",
    "NELSON commitment letter.pdf",
    "i-9.pdf"
  ],
  "failures": [],
  "auto_queued": [],
  "pgvector_only_sample": [
    "dreamer_changelog.md",
    "experiments-log-additions-2026-04-30.md",
    "2026-04-30-15-59-voice.md",
    "2026-04-30-16-59-voice.md",
    "2026-04-30-16-53-voice.md",
    "2026-04-30-17-06-voice.md",
    "2026-04-30-16-23-voice.md",
    "2026-04-30-late-rem.md",
    "2026-04-30-synthesis.md",
    "2026-04-30-nrem.md"
  ],
  "graphiti_only": []
 }
@@ -1,30 +1,46 @@
 {
-  "last_dream_timestamp": 1777480274.444462,
+  "last_dream_timestamp": 1777536047.392913,
  "last_dream_mode": "pipeline",
-  "last_dream_file": "Journal/Dreams/2026-04-29-synthesis-1.md",
+  "last_dream_file": "Journal/Dreams/2026-04-30-synthesis.md",
  "retrieved_sources": [
    "ChatGPT: CV Summary Request",
-    "Dossier Narrative.docx",
+    "ChatGPT: Program response drafting",
-    "2026-04-28-early-rem.md",
+    "Cognition in the Wild (A Bradford Book) -- Hutchins, Edwin.pdf",
-    "Advances in Architectural Geometry 2023 -- Kathrin D\u00f6rfler (editor); Jan Knippers (editor); Achim.pdf",
+    "ChatGPT: Digital Fabrication Cultural Project",
-    "2026-04-29-late-rem.md",
+    "Dossier Narrative.pdf",
-    "Mod06_GrabCAD_Print_and _Advanced_FDM_2023.pptx",
+    "E1_8-taxonomy-free-cascade-protocol.md",
    "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf",
    "Utah MDD - Aaron Nelson - Copy.pptx",
    "ChatGPT: Dean Position Evaluation",
    "Company of One -- Paul Jarvis.pdf",
    "References.docx",
    "Dossier Narrative Kill Me PLS_REV.docx",
    "ChatGPT: Career change anxiety",
    "Dossier Narrative.docx",
    "Dossier Narrative Kill Me PLS_REV_HOME.docx",
    "Aaron Nelson Tenure Dossier Narrative.pdf",
    "Claude: Importing chat history from ChatGPT",
    "aaronai-architecture.md",
    "Aaron AI: What should I be the most excited about right now?",
    "ChatGPT: Dean Position Evaluation",
    "2026-04-27-early-rem-1.md",
    "The Poetics of Space -- Gaston Bachelard translated from the French by Maria Jolas -- First Edition, 1994.pdf",
    "ChatGPT: Digital fabrication education",
    "Dossier Narrative Kill Me PLS.docx",
-    "ChatGPT: Digital Fabrication Cultural Project",
+    "Advances in Architectural Geometry 2023 -- Kathrin D\u00f6rfler (editor); Jan Knippers (editor); Achim.pdf",
    "Claude: Weighing Utah versus Oklahoma",
    "Claude: Importing chat history from ChatGPT",
    "Claude: I filling out my annual report...",
    "References.pdf",
-    "Dossier Narrative Kill Me PLS_REV_HOME.docx"
+    "2026-04-28-early-rem.md",
    "Claude: Law enforcement career options",
    "Dossier Narrative Kill Me PLS_REV_2.docx",
    "2026-04-29-late-rem.md",
    "Dossier Narrative Kill Me PLS_REV.docx",
    "Utah MDD - Aaron Nelson - Copy.pptx",
    "Dossier Narrative Kill Me PLS_REV_3.docx",
    "Aaron AI: Who's covering for me on sabbatical?",
    "ChatGPT: GA Proposal Revision Guide",
    "BirdAI-Experiments-Log.md",
    "Mod06_GrabCAD_Print_and _Advanced_FDM_2023.pptx",
    "The Extended Mind _ The Power of Thinking Outside the Brain -- Annie Murphy Paul.pdf",
    "Company of One -- Paul Jarvis.pdf",
    "Fabrication Processes_Syllabus.docx",
    "ChatGPT: Digital fabrication education",
    "Fabrication Processes_Syllabus DDF710 V3.docx",
    "Claude: Setting up a custom OpenClaw instance",
    "Claude: Weighing Utah versus Oklahoma"
  ]
 }
@@ -161,8 +161,6 @@ def require_auth(request: Request):
        raise HTTPException(status_code=401, detail="Not authenticated")
    return token
 CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"]
 def init_conversations_db():
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
@@ -224,50 +222,23 @@ def remove_from_memory(item):
    save_memory("\n".join(filtered))
    return len(lines) - len(filtered)
 def get_pinned_cv_context():
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute(
            "SELECT document, source FROM embeddings WHERE source = ANY(%s)",
            (CV_SOURCES,)
        )
        rows = cur.fetchall()
        pg.close()
        docs = [r[0] for r in rows]
        metas = [{"source": r[1]} for r in rows]
        return docs, metas
    except:
        return [], []
 def is_professional_query(query):
    keywords = ["grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee"]
    return any(k in query.lower() for k in keywords)
 def retrieve_context(query, n_results=8):
    """Pure semantic retrieval over pgvector. Top-N by cosine similarity, threshold 0.3.
    No CV pinning, no keyword routing — see architecture doc substrate-dependency section.
    Substrate-level workarounds (entity-keyed routing, hybrid retrieval) live at the
    Graphiti layer, not as wrapper logic above pgvector."""
    query_embedding = embedder.encode([query]).tolist()[0]
    context_pieces = []
    sources = []
    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for doc, meta in zip(cv_docs, cv_metas):
            context_pieces.append(f"[CV] {doc}")
            sources.append(meta.get("source", "CV"))
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
            FROM embeddings
            WHERE source NOT IN %s
            ORDER BY embedding <=> %s::vector
            LIMIT %s
-        """, (query_embedding, tuple(CV_SOURCES) if CV_SOURCES else ('__none__',),
+        """, (query_embedding, query_embedding, n_results))
              query_embedding, n_results))
        for doc, source, similarity in cur.fetchall():
            if similarity > 0.3:
                context_pieces.append(doc)
@@ -0,0 +1,442 @@
 """
 Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
 Reads entities from FalkorDB group_id 'aaron', infers light type labels,
 computes pairwise similarity within type blocks using ego summary embedding +
 name string distance + neighbor pattern overlap, generates merge proposals
 above threshold, writes proposal log for human review.
 Does NOT execute merges. 0.1 is the calibration phase — proposals only,
 human reviews before any action.
 """
 import json
 import re
 import os
 from datetime import datetime, timezone
 from collections import defaultdict
 from pathlib import Path
 from falkordb import FalkorDB
 import numpy as np
 # Configuration
 GROUP_ID = "aaron"
 HIGH_CONFIDENCE_THRESHOLD = 0.85  # propose merge above this
 LOW_CONFIDENCE_THRESHOLD = 0.65   # log as low-confidence below
 PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
 PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
 def cosine_similarity(a, b):
    """Cosine similarity between two embedding vectors."""
    a = np.array(a, dtype=np.float32)
    b = np.array(b, dtype=np.float32)
    na = np.linalg.norm(a)
    nb = np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return float(np.dot(a, b) / (na * nb))
 def name_similarity(name_a, name_b):
    """
    Token-overlap-based name similarity.
    Handles formal/informal pairs (Aaron / Aaron Nelson),
    abbreviation pairs (HVAMC / Hudson Valley AMC),
    and simple transcription noise.
    """
    a_lower = name_a.lower().strip()
    b_lower = name_b.lower().strip()
    if a_lower == b_lower:
        return 1.0
    # Tokenize
    a_tokens = set(re.findall(r'\b\w+\b', a_lower))
    b_tokens = set(re.findall(r'\b\w+\b', b_lower))
    if not a_tokens or not b_tokens:
        return 0.0
    # Substring containment (handles "Aaron" in "Aaron Nelson")
    if a_lower in b_lower or b_lower in a_lower:
        # Strong signal but not 1.0 — different lengths
        shorter = min(len(a_lower), len(b_lower))
        longer = max(len(a_lower), len(b_lower))
        return 0.7 + 0.2 * (shorter / longer)
    # Token Jaccard (handles "Aaron Nelson" vs "Nelson, Aaron")
    intersection = a_tokens & b_tokens
    union = a_tokens | b_tokens
    jaccard = len(intersection) / len(union)
    # Acronym check (HVAMC vs Hudson Valley Additive Manufacturing Center)
    def is_acronym(short, full):
        if len(short) >= len(full):
            return False
        if not short.isupper():
            short_upper = short.upper()
        else:
            short_upper = short
        full_words = full.split()
        if len(full_words) < 2:
            return False
        first_letters = ''.join(w[0].upper() for w in full_words if w)
        return short_upper == first_letters or short_upper in first_letters
    if is_acronym(name_a, name_b) or is_acronym(name_b, name_a):
        return 0.85
    return jaccard
 def infer_type(entity_name, summary):
    """
    Light type inference for blocking. Heuristic-based, transparent.
    Returns one of: person, organization, project, place, concept, unknown.
    NOT a precise classification — just enough to avoid obviously wrong
    cross-type comparisons (person vs project). When in doubt, return
    'unknown' which gets compared against everything.
    """
    name_lower = entity_name.lower().strip()
    summary_lower = (summary or "").lower()
    # Person: name patterns
    person_indicators = [
        # First+Last name pattern (two title-cased words, no other tokens)
        bool(re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', entity_name.strip())),
        # Single name that's also in the summary as a person
        any(phrase in summary_lower for phrase in [
            'is a person', 'is a professor', 'is an artist', 'is a colleague',
            'is a friend', 'is a family member', 'works at', 'studied at',
            "'s spouse", "'s child", "'s parent", "'s student",
        ]),
    ]
    if any(person_indicators):
        return "person"
    # Organization: company/institution indicators
    org_indicators = [
        any(suffix in name_lower for suffix in [
            ' inc', ' llc', ' corp', ' company', ' university', ' college',
            ' school', ' institute', ' foundation', ' department',
        ]),
        any(phrase in summary_lower for phrase in [
            'is a company', 'is a university', 'is an organization',
            'is an institution', 'is a department', 'is a nonprofit',
        ]),
    ]
    if any(org_indicators):
        return "organization"
    # Project: software/creative work indicators
    project_indicators = [
        any(phrase in summary_lower for phrase in [
            'is a project', 'software project', 'is a codebase',
            'is a tool', 'is a system', 'is an application',
            'is a research project', 'is a design project',
        ]),
        any(suffix in name_lower for suffix in [' project', ' system', ' platform']),
    ]
    if any(project_indicators):
        return "project"
    # Place: location indicators
    place_indicators = [
        any(phrase in summary_lower for phrase in [
            'is a city', 'is a town', 'is a state', 'is a country',
            'is a neighborhood', 'is a region', 'is a location',
        ]),
    ]
    if any(place_indicators):
        return "place"
    # Default
    return "unknown"
 def get_neighbors(graph, entity_uuid, limit=20):
    """Get the names of entities connected to this entity (1-hop)."""
    query = """
    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
    RETURN DISTINCT other.name AS name
    LIMIT $limit
    """
    result = graph.query(query, {"uuid": entity_uuid, "limit": limit})
    return set(row[0] for row in result.result_set if row[0])
 def neighbor_jaccard(neighbors_a, neighbors_b):
    """Jaccard similarity of two neighbor sets."""
    if not neighbors_a and not neighbors_b:
        return 0.0
    intersection = neighbors_a & neighbors_b
    union = neighbors_a | neighbors_b
    if not union:
        return 0.0
    return len(intersection) / len(union)
 def get_edge_count(graph, entity_uuid):
    query = """
    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
    RETURN count(r) AS c
    """
    result = graph.query(query, {"uuid": entity_uuid})
    return result.result_set[0][0] if result.result_set else 0
 def combine_signals(name_sim, ego_sim, neighbor_sim):
    """
    Combine the three similarity signals into a single confidence score.
    Weighting based on DEG-RAG findings: ego info is essential, neighbor
    cues help in some settings, name similarity is a strong tie-breaker
    but not the primary signal.
    For 0.1, simple weighted average with floor based on ego_sim alone.
    """
    # If ego similarity is very low, the entities probably aren't aliases
    # regardless of name match (different concepts can share names)
    if ego_sim < 0.4:
        return min(0.5, ego_sim)
    # If name is very similar AND ego is at least moderate, high confidence
    if name_sim >= 0.85 and ego_sim >= 0.65:
        return 0.5 * ego_sim + 0.3 * name_sim + 0.2 * neighbor_sim
    # Standard weighted average
    return 0.5 * ego_sim + 0.25 * name_sim + 0.25 * neighbor_sim
 def generate_proposals():
    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)
    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
    MATCH (n:Entity)
    WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
    RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)
    entities = []
    for row in result.result_set:
        entities.append({
            'uuid': row[0],
            'name': row[1],
            'summary': row[2],
            'embedding': row[3],
        })
    print(f"  Loaded {len(entities)} entities with embeddings")
    # Infer types for blocking
    print("Inferring entity types for blocking...")
    type_counts = defaultdict(int)
    for e in entities:
        e['inferred_type'] = infer_type(e['name'], e['summary'])
        type_counts[e['inferred_type']] += 1
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {t}: {c}")
    # Group by inferred type for blocking
    blocks = defaultdict(list)
    for e in entities:
        blocks[e['inferred_type']].append(e)
    # 'unknown' entities get compared against everything (they might be any type)
    # Other types only get compared within their type block + against unknowns
    print()
    print("Comparing entities within type blocks...")
    proposals = []
    low_confidence = []
    comparisons_done = 0
    # Build comparison pairs
    pairs_to_compare = []
    typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'}
    unknown_block = blocks.get('unknown', [])
    # Within-type pairs (excluding unknown)
    for t, ents in typed_blocks.items():
        for i in range(len(ents)):
            for j in range(i + 1, len(ents)):
                pairs_to_compare.append((ents[i], ents[j]))
    # Unknown vs unknown
    for i in range(len(unknown_block)):
        for j in range(i + 1, len(unknown_block)):
            pairs_to_compare.append((unknown_block[i], unknown_block[j]))
    # Unknown vs typed (unknowns might be any type)
    for ent_unknown in unknown_block:
        for t, ents in typed_blocks.items():
            for ent_typed in ents:
                pairs_to_compare.append((ent_unknown, ent_typed))
    print(f"  Pairs to compare: {len(pairs_to_compare):,}")
    # Compute similarities
    cache_neighbors = {}
    def neighbors_cached(uuid):
        if uuid not in cache_neighbors:
            cache_neighbors[uuid] = get_neighbors(graph, uuid)
        return cache_neighbors[uuid]
    for ent_a, ent_b in pairs_to_compare:
        comparisons_done += 1
        if comparisons_done % 5000 == 0:
            print(f"  ... {comparisons_done:,} / {len(pairs_to_compare):,}")
        # Quick filter: skip if name similarity is very low and names are clearly different
        name_sim = name_similarity(ent_a['name'], ent_b['name'])
        ego_sim_quick = cosine_similarity(ent_a['embedding'], ent_b['embedding'])
        # Pre-filter to avoid expensive neighbor query on obviously different pairs
        if ego_sim_quick < 0.5 and name_sim < 0.3:
            continue
        # Full comparison
        neighbors_a = neighbors_cached(ent_a['uuid'])
        neighbors_b = neighbors_cached(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)
        confidence = combine_signals(name_sim, ego_sim_quick, neighbor_sim)
        record = {
            'entity_a': {
                'uuid': ent_a['uuid'],
                'name': ent_a['name'],
                'type': ent_a['inferred_type'],
                'summary': ent_a['summary'][:200],
                'edge_count': get_edge_count(graph, ent_a['uuid']),
            },
            'entity_b': {
                'uuid': ent_b['uuid'],
                'name': ent_b['name'],
                'type': ent_b['inferred_type'],
                'summary': ent_b['summary'][:200],
                'edge_count': get_edge_count(graph, ent_b['uuid']),
            },
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim_quick, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(list(neighbors_a & neighbors_b))[:10],
        }
        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            proposals.append(record)
        elif confidence >= LOW_CONFIDENCE_THRESHOLD:
            low_confidence.append(record)
    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), len(pairs_to_compare)
 def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"
    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])
    lines = []
    lines.append(f"# Consolidator 0.1 — Run {timestamp}")
    lines.append("")
    lines.append("## Statistics")
    lines.append(f"- Entities scanned: {total_entities:,}")
    lines.append(f"- Pairwise comparisons: {total_comparisons:,}")
    lines.append(f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}")
    lines.append(f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}")
    lines.append("")
    lines.append("## How to review")
    lines.append("")
    lines.append("For each proposal, mark your decision by changing `[ ]` to one of:")
    lines.append("- `[APPROVE]` — execute this merge on next run")
    lines.append("- `[REJECT]` — don't merge, don't propose again")
    lines.append("- `[DEFER]` — re-surface in next run for further consideration")
    lines.append("")
    lines.append("Save the file when done. Do not modify proposal_id or uuid fields.")
    lines.append("")
    lines.append("---")
    lines.append("")
    lines.append(f"## Proposed Merges (n={len(proposals)})")
    lines.append("")
    for i, p in enumerate(proposals_sorted, start=1):
        lines.append(f"### Proposal {i}")
        lines.append("")
        lines.append(f"**Decision:** [ ]")
        lines.append("")
        lines.append(f"**Confidence:** {p['confidence']}")
        lines.append("")
        lines.append(f"**Entity A:** \"{p['entity_a']['name']}\" (type: {p['entity_a']['type']}, {p['entity_a']['edge_count']} edges)")
        lines.append(f"  - uuid: `{p['entity_a']['uuid']}`")
        lines.append(f"  - summary: {p['entity_a']['summary']}")
        lines.append("")
        lines.append(f"**Entity B:** \"{p['entity_b']['name']}\" (type: {p['entity_b']['type']}, {p['entity_b']['edge_count']} edges)")
        lines.append(f"  - uuid: `{p['entity_b']['uuid']}`")
        lines.append(f"  - summary: {p['entity_b']['summary']}")
        lines.append("")
        lines.append(f"**Signals:**")
        lines.append(f"  - Name similarity: {p['signals']['name_similarity']}")
        lines.append(f"  - Ego (summary) similarity: {p['signals']['ego_similarity']}")
        lines.append(f"  - Neighbor overlap: {p['signals']['neighbor_overlap']}")
        if p['shared_neighbors']:
            shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
            lines.append(f"  - Shared neighbors (sample): {shared_str}")
        lines.append("")
        lines.append("**Optional rejection note:** ")
        lines.append("")
        lines.append("---")
        lines.append("")
    lines.append("")
    lines.append(f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)")
    lines.append("")
    for p in low_sorted[:30]:
        lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})")
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")
    out_path.write_text("\n".join(lines))
    print(f"\nProposal log written to: {out_path}")
    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w') as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")
 def main():
    print("=" * 70)
    print("Consolidator 0.1 — Calibration Phase")
    print("=" * 70)
    print()
    proposals, low_confidence, total_entities, total_comparisons = generate_proposals()
    write_proposals_log(proposals, low_confidence, total_entities, total_comparisons)
    print()
    print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER")
    print("for each proposal. Re-run will read decisions and execute approved merges.")
 if __name__ == "__main__":
    main()
@@ -135,7 +135,7 @@ def queue_for_retry(source, full_text, filepath):
            ON CONFLICT (source) DO UPDATE SET
                full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length,
                enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0
-        """, (source, full_text[:50000], len(full_text)))
+        """, (source, full_text, len(full_text)))
        pg.commit()
        pg.close()
        return True
@@ -0,0 +1,245 @@
 #!/usr/bin/env python3
 """
 corpus_integrity.py — BirdAI Corpus Integrity Check
 Compares three sources of truth:
  1. Filesystem (Nextcloud) — what files exist
  2. pgvector (embeddings table) — what's been through Stage 1
  3. Graphiti (migration state + stage_3_queue) — what's been through Stage 3
 Usage:
    python3 corpus_integrity.py              # report only
    python3 corpus_integrity.py --fix        # report + auto-queue gaps for retry
    python3 corpus_integrity.py --json       # output JSON to stdout
 """
 import os
 import sys
 import json
 import argparse
 from pathlib import Path
 from datetime import datetime
 import psycopg2
 from dotenv import load_dotenv
 load_dotenv(Path.home() / "aaronai" / ".env", override=True)
 NEXTCLOUD_PATH  = "/home/aaron/nextcloud/data/data/aaron/files"
 MIGRATION_STATE = str(Path.home() / "aaronai" / "experiments" / "tier1_migration_state.json")
 REPORT_PATH     = str(Path.home() / "aaronai" / "corpus_integrity_report.json")
 SUPPORTED       = {".pdf", ".docx", ".pptx", ".txt", ".md"}
 PG_DSN          = os.getenv("PG_DSN")
 def get_pg():
    return psycopg2.connect(PG_DSN)
 def get_filesystem_files():
    files = []
    root = Path(NEXTCLOUD_PATH)
    for path in root.rglob("*"):
        if path.is_dir(): continue
        if path.suffix.lower() not in SUPPORTED: continue
        if path.name.startswith((".", "~$")): continue
        if "Admin/Backups" in str(path) or "Backups" in path.parts: continue
        if "Journal/Media" in str(path): continue
        files.append({"source": path.name, "filepath": str(path),
                      "size": path.stat().st_size, "mtime": path.stat().st_mtime})
    return files
 def get_pgvector_sources():
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("SELECT DISTINCT source FROM embeddings WHERE source IS NOT NULL")
        sources = {row[0] for row in cur.fetchall()}
        pg.close()
        return sources
    except Exception as e:
        print(f"ERROR: pgvector: {e}", file=sys.stderr)
        return set()
 def get_graphiti_sources():
    sources = set()
    try:
        state_path = Path(MIGRATION_STATE)
        if state_path.exists():
            state = json.loads(state_path.read_text())
            for filepath in state.get("ingested", []):
                sources.add(Path(filepath).name)
    except Exception as e:
        print(f"WARNING: migration state: {e}", file=sys.stderr)
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("SELECT DISTINCT source FROM stage_3_queue WHERE completed_at IS NOT NULL")
        for row in cur.fetchall(): sources.add(row[0])
        pg.close()
    except Exception as e:
        print(f"WARNING: stage_3_queue: {e}", file=sys.stderr)
    return sources
 def get_ingest_failures():
    failures = {}
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            SELECT source, filepath, error, retry_count, first_failed_at, last_failed_at
            FROM ingest_failures WHERE resolved = FALSE ORDER BY last_failed_at DESC
        """)
        for row in cur.fetchall():
            failures[row[0]] = {"source": row[0], "filepath": row[1], "error": row[2],
                                "retry_count": row[3], "first_failed_at": str(row[4]),
                                "last_failed_at": str(row[5])}
        pg.close()
    except Exception as e:
        print(f"WARNING: ingest_failures: {e}", file=sys.stderr)
    return failures
 def extract_text_for_retry(filepath):
    path = Path(filepath)
    suffix = path.suffix.lower()
    try:
        if suffix == ".docx":
            from docx import Document as D
            return "\n".join(p.text for p in D(path).paragraphs if p.text.strip())
        elif suffix == ".pdf":
            from pypdf import PdfReader
            return "".join(p.extract_text() + "\n" for p in PdfReader(path).pages if p.extract_text())
        elif suffix == ".pptx":
            from pptx import Presentation
            prs = Presentation(path)
            return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes
                             if hasattr(shape, "text") and shape.text.strip())
        elif suffix in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")
    except Exception as e:
        print(f"WARNING: extraction failed {path.name}: {e}", file=sys.stderr)
    return ""
 def queue_for_retry(source, full_text, filepath):
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            INSERT INTO stage_2_queue (source, full_text, char_length)
            VALUES (%s, %s, %s)
            ON CONFLICT (source) DO UPDATE SET
                full_text = EXCLUDED.full_text, char_length = EXCLUDED.char_length,
                enqueued_at = NOW(), completed_at = NULL, failed_at = NULL, attempts = 0
        """, (source, full_text[:50000], len(full_text)))
        pg.commit()
        pg.close()
        return True
    except Exception as e:
        print(f"WARNING: queue failed {source}: {e}", file=sys.stderr)
        return False
 def run_reconciliation(fix=False):
    print(f"BirdAI Corpus Integrity Check — {datetime.now().isoformat()}")
    print()
    print("Scanning filesystem...")
    fs_files = get_filesystem_files()
    fs_sources = {f["source"]: f for f in fs_files}
    print(f"  Filesystem: {len(fs_files)} files")
    print("Querying pgvector...")
    pv_sources = get_pgvector_sources()
    print(f"  pgvector:   {len(pv_sources)} distinct sources")
    print("Querying Graphiti...")
    gr_sources = get_graphiti_sources()
    print(f"  Graphiti:   {len(gr_sources)} sources")
    print("Querying ingest failures...")
    failures = get_ingest_failures()
    print(f"  Failures:   {len(failures)} unresolved")
    print()
    both, pv_only, neither, gr_only = [], [], [], []
    for source, finfo in fs_sources.items():
        in_pv = source in pv_sources
        in_gr = source in gr_sources
        if in_pv and in_gr: both.append(finfo)
        elif in_pv: pv_only.append(finfo)
        elif in_gr: gr_only.append(finfo)
        else: neither.append(finfo)
    orphans_pv = pv_sources - set(fs_sources.keys())
    orphans_gr = gr_sources - set(fs_sources.keys())
    print(f"Results:")
    print(f"  Both (pgvector + Graphiti): {len(both)}")
    print(f"  pgvector only:              {len(pv_only)}")
    print(f"  Neither (corpus gap):       {len(neither)}")
    print(f"  Graphiti only:              {len(gr_only)}")
    print(f"  Ingest failures:            {len(failures)}")
    print(f"  pgvector orphans:           {len(orphans_pv)}")
    print(f"  Graphiti orphans:           {len(orphans_gr)}")
    print()
    auto_queued = []
    if fix and neither:
        print(f"Auto-queuing {len(neither)} gap files...")
        for finfo in neither:
            text = extract_text_for_retry(finfo["filepath"])
            if text.strip():
                if queue_for_retry(finfo["source"], text, finfo["filepath"]):
                    auto_queued.append(finfo["source"])
                    print(f"  Queued: {finfo['source']}")
            else:
                print(f"  Skipped (unreadable): {finfo['source']}")
                try:
                    pg = get_pg()
                    cur = pg.cursor()
                    cur.execute("""
                        INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
                        VALUES (%s, %s, %s, 0, NOW(), NOW())
                        ON CONFLICT (source) DO UPDATE SET
                            error = EXCLUDED.error,
                            last_failed_at = NOW()
                    """, (finfo["source"], finfo["filepath"],
                          "Empty text — likely scanned, encrypted, or corrupt. Requires manual review or OCR."))
                    pg.commit()
                    pg.close()
                except Exception as e:
                    print(f"  WARNING: could not record failure: {e}")
        print()
    report = {
        "timestamp": datetime.now().isoformat(),
        "summary": {
            "filesystem_total": len(fs_files), "pgvector_total": len(pv_sources),
            "graphiti_total": len(gr_sources), "both": len(both),
            "pgvector_only": len(pv_only), "neither": len(neither),
            "graphiti_only": len(gr_only), "failures": len(failures),
            "orphans_pgvector": len(orphans_pv), "orphans_graphiti": len(orphans_gr),
        },
        "gaps": [f["source"] for f in neither],
        "failures": list(failures.values()),
        "auto_queued": auto_queued,
        "pgvector_only_sample": [f["source"] for f in pv_only[:20]],
        "graphiti_only": list(gr_only),
    }
    Path(REPORT_PATH).write_text(json.dumps(report, indent=2))
    print(f"Report written to: {REPORT_PATH}")
    return report
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--fix", action="store_true")
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()
    report = run_reconciliation(fix=args.fix)
    if args.json:
        print(json.dumps(report, indent=2))
 if __name__ == "__main__":
    main()
@@ -111,11 +111,16 @@ def get_recent_conversation_topics(days=14):
 # ─── Stage 2: Retrieve ──────────────────────────────────────────────────────
-def retrieve_graphiti(mode, task=None, n_results=8):
+def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
    """E3 experiment — Graphiti substrate retrieval.
    Queries Graphiti /search endpoint instead of pgvector.
    Returns chunks in same format as retrieve() for pipeline compatibility.
    Note: content is Graphiti facts (synthesized relationships), not raw chunks.
    Over-fetches by 3x to allow in-process filtering against excluded_sources,
    matching the cross-stage exclusion mechanism the pgvector branch uses.
    Without this filter, NREM/Early REM/Late REM would see overlapping content
    and the score-band Early REM exclusion (v1.1) would not apply in Graphiti mode.
    """
    import requests as req_lib
    if task:
@@ -129,25 +134,38 @@ def retrieve_graphiti(mode, task=None, n_results=8):
    else:
        query = "research fabrication teaching practice recent work"
    excluded_sources = excluded_sources or set()
    # Over-fetch so in-process exclusion still leaves enough results
    fetch_limit = n_results * 3 if excluded_sources else n_results
    try:
        resp = req_lib.get(
            "http://localhost:8001/search",
-            params={"query": query, "limit": n_results, "group_id": "aaron"},
+            params={"query": query, "limit": fetch_limit, "group_id": "aaron"},
            timeout=30,
        )
        resp.raise_for_status()
        results = resp.json().get("results", [])
        chunks = []
        seen_sources = set()
        for r in results:
            fact = r.get("fact", "")
            if not fact.strip():
                continue
            source = r.get("source", "graphiti")
            if source in excluded_sources:
                continue
            if source in seen_sources:
                continue
            chunks.append({
-                "source": r.get("source", "graphiti"),
+                "source": source,
                "content": fact,
                "relevance": r.get("score", 0.5),
                "similarity": r.get("score", 0.5),
            })
            seen_sources.add(source)
            if len(chunks) >= n_results:
                break
        return chunks
    except Exception as e:
        print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
@@ -158,7 +176,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None):
    # Default behavior: pgvector similarity search (unchanged)
    substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
    if substrate == "graphiti":
-        return retrieve_graphiti(mode, task=task, n_results=n_results)
+        return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    low, high = MODE_RANGES[mode]
@@ -0,0 +1,554 @@
 """
 Aaron AI Dreamer — Active Inference Engine
 Interdependent stage architecture grounded in sleep consolidation research.
 Nightly pipeline: NREM → Early REM → Late REM → Synthesis
 Each stage receives the previous stage's output as context.
 Lucid mode is on-demand only (Dream Now from settings).
 Research basis:
 - Singh et al. PNAS 2022: alternating NREM/REM outperforms single-stage approaches
 - Klinzing et al. Nature Neuroscience 2019: SO-spindle-ripple coupling is interdependent
 - REM operates on what NREM produced — stages are not discrete alternatives
 """
 import os
 import json
 import sqlite3
 import argparse
 from pathlib import Path
 from datetime import datetime, timedelta
 from dotenv import load_dotenv
 import psycopg2
 import hashlib
 load_dotenv(Path.home() / "aaronai" / ".env")
 PG_DSN = os.getenv("PG_DSN")
 def get_pg():
    return psycopg2.connect(PG_DSN)
 # ─── Paths ──────────────────────────────────────────────────────────────────
 CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
 WATCHER_STATE    = str(Path.home() / "aaronai" / "watcher_state.json")
 DREAMER_STATE    = str(Path.home() / "aaronai" / "dreamer_state.json")
 JOURNAL_DIR      = "/home/aaron/nextcloud/data/data/aaron/files/Journal/Daily"
 NEXTCLOUD_URL      = os.getenv("NEXTCLOUD_URL", "https://nextcloud.aaronnelson.studio")
 NEXTCLOUD_USER     = os.getenv("NEXTCLOUD_USER", "aaron")
 NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "")
 DREAMS_WEBDAV      = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams"
 # Similarity ranges calibrated for all-MiniLM-L6-v2
 MODE_RANGES = {
    "nrem":      (0.48, 0.72),
    "early-rem": (0.38, 0.55),
    "late-rem":  (0.22, 0.42),
    "lucid":     (0.32, 0.72),
 }
 # ─── Prompt versioning ──────────────────────────────────────────────────────
 # Bump the relevant constant manually when changing a prompt.
 PROMPT_VERSION_NREM = "1.0"
 PROMPT_VERSION_EREM = "1.1"
 PROMPT_VERSION_LREM = "1.2"
 PROMPT_VERSION_SYN  = "1.0"
 def prompt_signature():
    return (f"nrem={PROMPT_VERSION_NREM}|erem={PROMPT_VERSION_EREM}"
            f"|lrem={PROMPT_VERSION_LREM}|syn={PROMPT_VERSION_SYN}")
 def prompt_hash(prompts: list[str]) -> str:
    combined = "".join(prompts)
    return hashlib.md5(combined.encode()).hexdigest()[:8]
 # ─── Stage 1: Observe ───────────────────────────────────────────────────────
 def observe_corpus():
    state = load_dreamer_state()
    last_dream = state.get("last_dream_timestamp", 0)
    new_chunk_count = 0
    try:
        watcher_state = json.loads(Path(WATCHER_STATE).read_text())
        for path, mtime in watcher_state.items():
            if float(mtime) > last_dream:
                new_chunk_count += 1
    except:
        pass
    days_since = (datetime.now().timestamp() - last_dream) / 86400
    recent_topics = get_recent_conversation_topics()
    return {
        "new_chunks": new_chunk_count,
        "days_since_dream": days_since,
        "recent_topics": recent_topics,
        "last_dream": last_dream,
    }
 def get_recent_conversation_topics(days=14):
    try:
        conn = sqlite3.connect(CONVERSATIONS_DB)
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()
        c = conn.cursor()
        c.execute("""
            SELECT m.content FROM messages m
            JOIN conversations c ON m.conversation_id = c.id
            WHERE m.role = 'user' AND c.updated_at > ?
            ORDER BY m.timestamp DESC LIMIT 20
        """, (cutoff,))
        rows = c.fetchall()
        conn.close()
        return [r[0][:200] for r in rows]
    except:
        return []
 # ─── Stage 2: Retrieve ──────────────────────────────────────────────────────
 def retrieve(mode, task=None, n_results=8):
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    low, high = MODE_RANGES[mode]
    if task:
        query = task
    elif mode == "late-rem":
        delta = observe_corpus()
        topics = delta.get("recent_topics", [])
        query = topics[0] if topics else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"
    embedding = embedder.encode([query]).tolist()[0]
    chunks = []
    seen_sources = set()
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
            FROM embeddings
            ORDER BY embedding <=> %s::vector
            LIMIT %s
        """, (embedding, embedding, n_results * 3))
        for doc, source, similarity in cur.fetchall():
            if not (low <= similarity <= high):
                continue
            if source in seen_sources:
                continue
            chunks.append({
                "source": source or "unknown",
                "content": doc,
                "relevance": similarity,
            })
            seen_sources.add(source)
            if len(chunks) >= n_results:
                break
        pg.close()
    except Exception as e:
        print(f"pgvector retrieval error: {e}")
    return chunks
 # ─── Stage 3: Synthesize ────────────────────────────────────────────────────
 def synthesize_nrem(chunks):
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""You have read everything Aaron Nelson has written and published.
 You are a careful colleague who noticed something this week.
 Here is material from his corpus:
 {chunk_text}
 Write to Aaron directly. Identify one specific connection between
 this material and something he wrote or worked on previously.
 Stay close to the documents — cite them specifically by name.
 Do not speculate beyond what the material supports. Do not use
 headers or bullet points. Write one paragraph of 200-300 words
 that ends with a single concrete question he could act on."""
    return _call_claude(prompt)
 def synthesize_early_rem(chunks, nrem_output):
    # v1.1 — removed citation instruction, removed close-friend persona,
    # shifted register from analysis to recognition.
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""Something was noticed earlier tonight, moving through Aaron's recent work:
 {nrem_output}
 That observation is still with you. Now here is material from a different
 time — pulled from further back, from different parts of his corpus:
 {chunk_text}
 You are not analyzing. You are recognizing.
 Something in the earlier observation and something in this older material
 are the same thing wearing different clothes. Find it. Don't explain why
 they're connected — just let the connection speak. Write from inside the
 recognition, not from above it.
 The emotional register underneath the career logic is more interesting
 than the career logic. The pattern that has been repeating longer than
 he has been aware of it is more interesting than the current instance.
 Write directly to Aaron. No citations, no references, no analysis.
 First person, present tense. Let what you noticed arrive rather than
 be delivered. 150-250 words. End with one thing that is true that
 he probably already knows but hasn't said out loud yet."""
    return _call_claude(prompt)
 def synthesize_late_rem(chunks, nrem_output, early_rem_output):
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""You have been moving through Aaron Nelson's corpus all night.
 First you found this, in the careful light of early consolidation:
 {nrem_output}
 Then, in the more personal territory that followed:
 {early_rem_output}
 Now it is late. The boundaries between things have loosened.
 Here is material pulled from opposite ends of his work:
 {chunk_text}
 Do not explain the connections between all of this.
 Do not resolve them. Do not summarize what came before.
 Something stranger is possible now — let the accumulated
 material from the night find its own shape. Compressed,
 associative, slightly off. Let the strangeness stand.
 No headers. No bullet points. No hedging. No resolution.
 No offer. End mid-thought if that is where the material ends.
 150-250 words."""
    return _call_claude(prompt)
 def synthesize_final(nrem_output, early_rem_output, late_rem_output):
    prompt = f"""You have spent the night moving through Aaron Nelson's corpus
 in three passes, each building on the last.
 The first pass — careful, close to the documents:
 {nrem_output}
 The second pass — more personal, following what the first opened:
 {early_rem_output}
 The third pass — associative, strange, letting things touch that
 don't normally touch:
 {late_rem_output}
 Now synthesize. Not a summary — a synthesis. Find what runs through
 all three that none of them said directly. The thing that only becomes
 visible when you hold all three passes together.
 Write it as a single unbroken piece. No headers, no bullet points,
 no stage labels. 200-300 words. End with the one question that
 matters most right now."""
    return _call_claude(prompt, max_tokens=800)
 def synthesize_lucid(chunks, task):
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""Aaron has a question he is sitting with:
 {task or "What should I be thinking about that I am not?"}
 You have searched his entire corpus and found material that
 speaks to this question from unexpected directions. Here is
 what you found:
 {chunk_text}
 Do not summarize. Do not list. Pick the most interesting
 tension between what the corpus contains and what he is
 asking, and follow it through to its conclusion. Cite
 specific documents by name. Be direct about what you think.
 No headers, no bullet points. 250-400 words.
 End with an offer to work on it together."""
    return _call_claude(prompt)
 def _call_claude(prompt, max_tokens=1000):
    import anthropic
    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.content[0].text
 # ─── Stage 4: Deliver ───────────────────────────────────────────────────────
 def deliver(dream_text, mode, task=None):
    import requests
    date_str = datetime.now().strftime("%Y-%m-%d")
    filename = f"{date_str}-{mode}.md"
    header = f"# Dream — {mode.upper()} — {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
    header += f"*prompt_sig: {prompt_signature()}*\n\n"
    if task:
        header += f"*Task: {task}*\n\n"
    header += "---\n\n"
    content = header + dream_text
    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    requests.request("MKCOL", DREAMS_WEBDAV, auth=auth, timeout=10)
    url = f"{DREAMS_WEBDAV}/{filename}"
    counter = 1
    while True:
        check = requests.request("PROPFIND", url, auth=auth, timeout=10)
        if check.status_code == 404:
            break
        filename = f"{date_str}-{mode}-{counter}.md"
        url = f"{DREAMS_WEBDAV}/{filename}"
        counter += 1
    response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
    response.raise_for_status()
    print(f"Delivered: Journal/Dreams/{filename}")
    return f"Journal/Dreams/{filename}"
 def notify_sse(mode, filename):
    try:
        import requests
        requests.post("http://localhost:8000/api/events/notify", json={
            "type": "dream",
            "mode": mode,
            "filename": filename,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        }, timeout=3)
    except Exception as e:
        print(f"SSE notify failed (non-critical): {e}")
 # ─── State ──────────────────────────────────────────────────────────────────
 def load_dreamer_state():
    p = Path(DREAMER_STATE)
    if p.exists():
        try:
            return json.loads(p.read_text())
        except:
            pass
    return {}
 def save_dreamer_state(state):
    Path(DREAMER_STATE).write_text(json.dumps(state, indent=2))
 # ─── Orchestrators ───────────────────────────────────────────────────────────
 def write_manifest(date_str, stage_data, corpus_data):
    import requests
    manifest = {
        "date": date_str,
        "prompt_sig": prompt_signature(),
        "prompt_hash": prompt_hash([
            synthesize_nrem.__doc__ or "",
            synthesize_early_rem.__doc__ or "",
            synthesize_late_rem.__doc__ or "",
            synthesize_final.__doc__ or "",
        ]),
        "stages": stage_data,
        "corpus": corpus_data,
        "rating": None,
        "notes": "",
    }
    content = json.dumps(manifest, indent=2)
    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json"
    try:
        requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
        print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json")
    except Exception as e:
        print(f"Manifest write failed (non-critical): {e}")
 def dream_pipeline():
    """
    Full nightly pipeline — interdependent stages.
    NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis.
    """
    print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    delta = observe_corpus()
    print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
    # ── Stage 1: NREM ──────────────────────────────────────────────────────
    print("\n[NREM] Retrieving...")
    nrem_chunks = retrieve("nrem")
    if not nrem_chunks:
        print("[NREM] No suitable chunks — aborting pipeline")
        return None
    print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
    nrem_output = synthesize_nrem(nrem_chunks)
    nrem_file = deliver(nrem_output, "nrem")
    stage_data = {
        "nrem": {
            "chunks_retrieved": len(nrem_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
            "query": "research fabrication teaching practice recent work",
            "word_count": len(nrem_output.split()),
            "status": "ok",
        }
    }
    print(f"[NREM] Done.\n{nrem_output[:200]}...")
    # ── Stage 2: Early REM — informed by NREM ──────────────────────────────
    print("\n[Early REM] Retrieving...")
    early_chunks = retrieve("early-rem")
    if not early_chunks:
        print("[Early REM] No suitable chunks — skipping")
        early_rem_output = nrem_output  # fallback
    else:
        print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
        early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
        deliver(early_rem_output, "early-rem")
        stage_data["early_rem"] = {
            "chunks_retrieved": len(early_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
            "query": "career decision personal change what matters next",
            "word_count": len(early_rem_output.split()),
            "status": "ok",
        }
        print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
    # ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
    print("\n[Late REM] Retrieving...")
    late_chunks = retrieve("late-rem")
    if not late_chunks:
        print("[Late REM] No suitable chunks — skipping")
        late_rem_output = early_rem_output  # fallback
    else:
        print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
        late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
        deliver(late_rem_output, "late-rem")
        stage_data["late_rem"] = {
            "chunks_retrieved": len(late_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
            "query": "practice place memory making",
            "word_count": len(late_rem_output.split()),
            "status": "ok",
        }
        print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
    # ── Stage 4: Synthesis — all three stages ─────────────────────────────
    print("\n[Synthesis] Integrating all stages...")
    synthesis_output = synthesize_final(nrem_output, early_rem_output, late_rem_output)
    synthesis_file = deliver(synthesis_output, "synthesis")
    stage_data["synthesis"] = {
        "word_count": len(synthesis_output.split()),
        "status": "ok",
    }
    print(f"\n{'='*60}")
    print("SYNTHESIS:")
    print(synthesis_output)
    print(f"{'='*60}")
    # Write manifest
    corpus_data = {
        "total_chunks": delta.get("new_chunks", 0),
        "new_chunks_since_last_dream": delta.get("new_chunks", 0),
        "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
    }
    write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
    # Update state and notify
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = "pipeline"
    state["last_dream_file"] = synthesis_file
    save_dreamer_state(state)
    notify_sse("synthesis", synthesis_file.split("/")[-1])
    print(f"\nPipeline complete. Synthesis: {synthesis_file}")
    return synthesis_file
 def dream_lucid(task):
    """On-demand lucid dream — single mode, used by Dream Now in settings."""
    print(f"Lucid dream starting — task: {task[:80] if task else 'none'}")
    chunks = retrieve("lucid", task=task)
    if not chunks:
        print("No suitable chunks — aborting")
        return None
    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")
    output = synthesize_lucid(chunks, task)
    filepath = deliver(output, "lucid", task=task)
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = "lucid"
    state["last_dream_file"] = filepath
    save_dreamer_state(state)
    notify_sse("lucid", filepath.split("/")[-1])
    print(f"\n{'='*60}")
    print(output)
    print(f"{'='*60}")
    print(f"\nDelivered to {filepath}")
    return filepath
 def dream_single(mode, task=None):
    """
    Single mode — used by Dream Now for non-lucid modes.
    Runs one stage independently (for testing/tuning individual stages).
    """
    print(f"Single mode dream: {mode}")
    chunks = retrieve(mode, task=task)
    if not chunks:
        print("No suitable chunks — aborting")
        return None
    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")
    if mode == "nrem":
        output = synthesize_nrem(chunks)
    elif mode == "early-rem":
        output = synthesize_early_rem(chunks, "")
    elif mode == "late-rem":
        output = synthesize_late_rem(chunks, "", "")
    else:
        output = synthesize_lucid(chunks, task)
    filepath = deliver(output, mode, task=task)
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = mode
    state["last_dream_file"] = filepath
    save_dreamer_state(state)
    notify_sse(mode, filepath.split("/")[-1])
    print(f"\n{'='*60}")
    print(output)
    print(f"{'='*60}")
    print(f"\nDelivered to {filepath}")
    return filepath
 # ─── CLI ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aaron AI Dreamer")
    parser.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"])
    parser.add_argument("--task", type=str)
    args = parser.parse_args()
    if args.mode == "lucid":
        dream_lucid(args.task or "What should I be thinking about that I am not?")
    elif args.mode and args.mode != "pipeline":
        dream_single(args.mode, args.task)
    else:
        # Default: full pipeline
        dream_pipeline()
@@ -0,0 +1,668 @@
 """
 Aaron AI Dreamer — Active Inference Engine
 Interdependent stage architecture grounded in sleep consolidation research.
 Nightly pipeline: NREM → Early REM → Late REM → Synthesis
 Each stage receives the previous stage's output as context.
 Lucid mode is on-demand only (Dream Now from settings).
 Research basis:
 - Singh et al. PNAS 2022: alternating NREM/REM outperforms single-stage approaches
 - Klinzing et al. Nature Neuroscience 2019: SO-spindle-ripple coupling is interdependent
 - REM operates on what NREM produced — stages are not discrete alternatives
 """
 import os
 import json
 import sqlite3
 import argparse
 from pathlib import Path
 from datetime import datetime, timedelta
 from dotenv import load_dotenv
 import psycopg2
 import hashlib
 load_dotenv(Path.home() / "aaronai" / ".env", override=True)
 PG_DSN = os.getenv("PG_DSN")
 def get_pg():
    return psycopg2.connect(PG_DSN)
 # ─── Paths ──────────────────────────────────────────────────────────────────
 CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
 WATCHER_STATE    = str(Path.home() / "aaronai" / "watcher_state.json")
 DREAMER_STATE    = str(Path.home() / "aaronai" / "dreamer_state.json")
 JOURNAL_DIR      = "/home/aaron/nextcloud/data/data/aaron/files/Journal/Daily"
 NEXTCLOUD_URL      = os.getenv("NEXTCLOUD_URL", "https://nextcloud.aaronnelson.studio")
 NEXTCLOUD_USER     = os.getenv("NEXTCLOUD_USER", "aaron")
 NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "")
 DREAMS_WEBDAV      = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams"
 # Similarity ranges calibrated for all-MiniLM-L6-v2
 MODE_RANGES = {
    "nrem":      (0.48, 0.72),
    "early-rem": (0.38, 0.55),
    "late-rem":  (0.22, 0.42),
    "lucid":     (0.32, 0.72),
 }
 DREAMER_VERSION = "1.1"  # 1.0=original exclusion logic; 1.1=score-band exclusion
 # ─── Prompt versioning ──────────────────────────────────────────────────────
 # Bump the relevant constant manually when changing a prompt.
 PROMPT_VERSION_NREM = "1.0"
 PROMPT_VERSION_EREM = "1.1"
 PROMPT_VERSION_LREM = "1.2"
 PROMPT_VERSION_SYN  = "1.0"
 def prompt_signature():
    return (f"nrem={PROMPT_VERSION_NREM}|erem={PROMPT_VERSION_EREM}"
            f"|lrem={PROMPT_VERSION_LREM}|syn={PROMPT_VERSION_SYN}")
 def prompt_hash(prompts: list[str]) -> str:
    combined = "".join(prompts)
    return hashlib.md5(combined.encode()).hexdigest()[:8]
 def extract_folder(source_path):
    """Extract top-level Nextcloud folder from source path."""
    parts = source_path.replace("\\", "/").split("/")
    return parts[0] if parts else "unknown"
 # ─── Stage 1: Observe ───────────────────────────────────────────────────────
 def observe_corpus():
    state = load_dreamer_state()
    last_dream = state.get("last_dream_timestamp", 0)
    new_chunk_count = 0
    try:
        watcher_state = json.loads(Path(WATCHER_STATE).read_text())
        for path, mtime in watcher_state.items():
            if float(mtime) > last_dream:
                new_chunk_count += 1
    except:
        pass
    days_since = (datetime.now().timestamp() - last_dream) / 86400
    recent_topics = get_recent_conversation_topics()
    return {
        "new_chunks": new_chunk_count,
        "days_since_dream": days_since,
        "recent_topics": recent_topics,
        "last_dream": last_dream,
    }
 def get_recent_conversation_topics(days=14):
    try:
        conn = sqlite3.connect(CONVERSATIONS_DB)
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()
        c = conn.cursor()
        c.execute("""
            SELECT m.content FROM messages m
            JOIN conversations c ON m.conversation_id = c.id
            WHERE m.role = 'user' AND c.updated_at > ?
            ORDER BY m.timestamp DESC LIMIT 20
        """, (cutoff,))
        rows = c.fetchall()
        conn.close()
        return [r[0][:200] for r in rows]
    except:
        return []
 # ─── Stage 2: Retrieve ──────────────────────────────────────────────────────
 def retrieve_graphiti(mode, task=None, n_results=8):
    """E3 experiment — Graphiti substrate retrieval.
    Queries Graphiti /search endpoint instead of pgvector.
    Returns chunks in same format as retrieve() for pipeline compatibility.
    Note: content is Graphiti facts (synthesized relationships), not raw chunks.
    """
    import requests as req_lib
    if task:
        query = task
    elif mode == "late-rem":
        delta = observe_corpus()
        topics = delta.get("recent_topics", [])
        query = topics[0] if topics else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"
    try:
        resp = req_lib.get(
            "http://localhost:8001/search",
            params={"query": query, "limit": n_results, "group_id": "aaron"},
            timeout=30,
        )
        resp.raise_for_status()
        results = resp.json().get("results", [])
        chunks = []
        for r in results:
            fact = r.get("fact", "")
            if not fact.strip():
                continue
            chunks.append({
                "source": r.get("source", "graphiti"),
                "content": fact,
                "relevance": r.get("score", 0.5),
                "similarity": r.get("score", 0.5),
            })
        return chunks
    except Exception as e:
        print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
        return []
 def retrieve(mode, task=None, n_results=8, excluded_sources=None):
    # E3 experiment: DREAMER_SUBSTRATE=graphiti routes retrieval to Graphiti /search
    # Default behavior: pgvector similarity search (unchanged)
    substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
    if substrate == "graphiti":
        return retrieve_graphiti(mode, task=task, n_results=n_results)
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    low, high = MODE_RANGES[mode]
    if task:
        query = task
    elif mode == "late-rem":
        delta = observe_corpus()
        topics = delta.get("recent_topics", [])
        query = topics[0] if topics else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"
    embedding = embedder.encode([query]).tolist()[0]
    chunks = []
    seen_sources = set()
    try:
        pg = get_pg()
        cur = pg.cursor()
        excluded_sources = excluded_sources or set()
        if excluded_sources:
            cur.execute("""
                SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
                FROM embeddings
                WHERE source NOT IN %s
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (embedding, tuple(excluded_sources), embedding, n_results * 3))
        else:
            cur.execute("""
                SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
                FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (embedding, embedding, n_results * 3))
        for doc, source, similarity in cur.fetchall():
            if not (low <= similarity <= high):
                continue
            if source in seen_sources:
                continue
            chunks.append({
                "source": source or "unknown",
                "content": doc,
                "relevance": similarity,
                "similarity": similarity,
            })
            seen_sources.add(source)
            if len(chunks) >= n_results:
                break
        pg.close()
    except Exception as e:
        print(f"pgvector retrieval error: {e}")
    return chunks
 # ─── Stage 3: Synthesize ────────────────────────────────────────────────────
 def synthesize_nrem(chunks):
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""You have read everything Aaron Nelson has written and published.
 You are a careful colleague who noticed something this week.
 Here is material from his corpus:
 {chunk_text}
 Write to Aaron directly. Identify one specific connection between
 this material and something he wrote or worked on previously.
 Stay close to the documents — cite them specifically by name.
 Do not speculate beyond what the material supports. Do not use
 headers or bullet points. Write one paragraph of 200-300 words
 that ends with a single concrete question he could act on."""
    return _call_claude(prompt)
 def synthesize_early_rem(chunks, nrem_output):
    # v1.1 — removed citation instruction, removed close-friend persona,
    # shifted register from analysis to recognition.
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""Something was noticed earlier tonight, moving through Aaron's recent work:
 {nrem_output}
 That observation is still with you. Now here is material from a different
 time — pulled from further back, from different parts of his corpus:
 {chunk_text}
 You are not analyzing. You are recognizing.
 Something in the earlier observation and something in this older material
 are the same thing wearing different clothes. Find it. Don't explain why
 they're connected — just let the connection speak. Write from inside the
 recognition, not from above it.
 The emotional register underneath the career logic is more interesting
 than the career logic. The pattern that has been repeating longer than
 he has been aware of it is more interesting than the current instance.
 Write directly to Aaron. No citations, no references, no analysis.
 First person, present tense. Let what you noticed arrive rather than
 be delivered. 150-250 words. End with one thing that is true that
 he probably already knows but hasn't said out loud yet."""
    return _call_claude(prompt)
 def synthesize_late_rem(chunks, nrem_output, early_rem_output):
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""You have been moving through Aaron Nelson's corpus all night.
 First you found this, in the careful light of early consolidation:
 {nrem_output}
 Then, in the more personal territory that followed:
 {early_rem_output}
 Now it is late. The boundaries between things have loosened.
 Here is material pulled from opposite ends of his work:
 {chunk_text}
 Do not explain the connections between all of this.
 Do not resolve them. Do not summarize what came before.
 Something stranger is possible now — let the accumulated
 material from the night find its own shape. Compressed,
 associative, slightly off. Let the strangeness stand.
 No headers. No bullet points. No hedging. No resolution.
 No offer. End mid-thought if that is where the material ends.
 150-250 words."""
    return _call_claude(prompt)
 def synthesize_final(nrem_output, early_rem_output, late_rem_output):
    prompt = f"""You have spent the night moving through Aaron Nelson's corpus
 in three passes, each building on the last.
 The first pass — careful, close to the documents:
 {nrem_output}
 The second pass — more personal, following what the first opened:
 {early_rem_output}
 The third pass — associative, strange, letting things touch that
 don't normally touch:
 {late_rem_output}
 Now synthesize. Not a summary — a synthesis. Find what runs through
 all three that none of them said directly. The thing that only becomes
 visible when you hold all three passes together.
 Write it as a single unbroken piece. No headers, no bullet points,
 no stage labels. 200-300 words. End with the one question that
 matters most right now."""
    return _call_claude(prompt, max_tokens=800)
 def synthesize_lucid(chunks, task):
    chunk_text = "\n\n---\n\n".join([f"[{c['source']}]\n{c['content']}" for c in chunks])
    prompt = f"""Aaron has a question he is sitting with:
 {task or "What should I be thinking about that I am not?"}
 You have searched his entire corpus and found material that
 speaks to this question from unexpected directions. Here is
 what you found:
 {chunk_text}
 Do not summarize. Do not list. Pick the most interesting
 tension between what the corpus contains and what he is
 asking, and follow it through to its conclusion. Cite
 specific documents by name. Be direct about what you think.
 No headers, no bullet points. 250-400 words.
 End with an offer to work on it together."""
    return _call_claude(prompt)
 def _call_claude(prompt, max_tokens=1000):
    import anthropic
    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.content[0].text
 # ─── Stage 4: Deliver ───────────────────────────────────────────────────────
 def deliver(dream_text, mode, task=None):
    import requests
    date_str = datetime.now().strftime("%Y-%m-%d")
    filename = f"{date_str}-{mode}.md"
    header = f"# Dream — {mode.upper()} — {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
    header += f"*prompt_sig: {prompt_signature()}*\n\n"
    if task:
        header += f"*Task: {task}*\n\n"
    header += "---\n\n"
    content = header + dream_text
    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    requests.request("MKCOL", DREAMS_WEBDAV, auth=auth, timeout=10)
    url = f"{DREAMS_WEBDAV}/{filename}"
    counter = 1
    while True:
        check = requests.request("PROPFIND", url, auth=auth, timeout=10)
        if check.status_code == 404:
            break
        filename = f"{date_str}-{mode}-{counter}.md"
        url = f"{DREAMS_WEBDAV}/{filename}"
        counter += 1
    response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
    response.raise_for_status()
    print(f"Delivered: Journal/Dreams/{filename}")
    return f"Journal/Dreams/{filename}"
 def notify_sse(mode, filename):
    try:
        import requests
        requests.post("http://localhost:8000/api/events/notify", json={
            "type": "dream",
            "mode": mode,
            "filename": filename,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        }, timeout=3)
    except Exception as e:
        print(f"SSE notify failed (non-critical): {e}")
 # ─── State ──────────────────────────────────────────────────────────────────
 def load_dreamer_state():
    p = Path(DREAMER_STATE)
    if p.exists():
        try:
            return json.loads(p.read_text())
        except:
            pass
    return {}
 def save_dreamer_state(state):
    Path(DREAMER_STATE).write_text(json.dumps(state, indent=2))
 # ─── Orchestrators ───────────────────────────────────────────────────────────
 def write_manifest(date_str, stage_data, corpus_data):
    import requests
    manifest = {
        "date": date_str,
        "prompt_sig": prompt_signature(),
        "dreamer_version": DREAMER_VERSION,
        "prompt_hash": prompt_hash([
            synthesize_nrem.__doc__ or "",
            synthesize_early_rem.__doc__ or "",
            synthesize_late_rem.__doc__ or "",
            synthesize_final.__doc__ or "",
        ]),
        "stages": stage_data,
        "corpus": corpus_data,
        "rating": None,
        "notes": "",
    }
    content = json.dumps(manifest, indent=2)
    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json"
    try:
        requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
        print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json")
    except Exception as e:
        print(f"Manifest write failed (non-critical): {e}")
 def dream_pipeline():
    """
    Full nightly pipeline — interdependent stages.
    NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis.
    """
    print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    state = load_dreamer_state()
    previously_retrieved = set(state.get("retrieved_sources", []))
    session_retrieved = set()
    delta = observe_corpus()
    print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
    print(f"Excluding {len(previously_retrieved)} previously retrieved sources")
    # ── Stage 1: NREM ──────────────────────────────────────────────────────
    print("\n[NREM] Retrieving...")
    nrem_chunks = retrieve("nrem", excluded_sources=previously_retrieved | session_retrieved)
    session_retrieved.update(c["source"] for c in nrem_chunks)
    # Track sources that scored above Early REM ceiling — these are the only ones Early REM should exclude
    nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > 0.55}
    if not nrem_chunks:
        print("[NREM] No suitable chunks — aborting pipeline")
        return None
    print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
    nrem_output = synthesize_nrem(nrem_chunks)
    nrem_file = deliver(nrem_output, "nrem")
    nrem_sources = [c["source"] for c in nrem_chunks]
    nrem_folders = list({extract_folder(s) for s in nrem_sources})
    stage_data = {
        "nrem": {
            "chunks_retrieved": len(nrem_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
            "query": "research fabrication teaching practice recent work",
            "word_count": len(nrem_output.split()),
            "sources": nrem_sources,
            "distinct_folders": nrem_folders,
            "folder_count": len(nrem_folders),
            "status": "ok",
        }
    }
    print(f"[NREM] Done.\n{nrem_output[:200]}...")
    # ── Stage 2: Early REM — informed by NREM ──────────────────────────────
    print("\n[Early REM] Retrieving...")
    # Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
    # Sources that scored in Early REM band during NREM remain available
    early_chunks = retrieve("early-rem", excluded_sources=previously_retrieved | nrem_high_sources)
    session_retrieved.update(c["source"] for c in early_chunks)
    if not early_chunks:
        print("[Early REM] No suitable chunks — skipping")
        early_rem_output = nrem_output  # fallback
    else:
        print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
        early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
        deliver(early_rem_output, "early-rem")
        early_sources = [c["source"] for c in early_chunks]
        early_folders = list({extract_folder(s) for s in early_sources})
        stage_data["early_rem"] = {
            "chunks_retrieved": len(early_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
            "query": "career decision personal change what matters next",
            "word_count": len(early_rem_output.split()),
            "sources": early_sources,
            "distinct_folders": early_folders,
            "folder_count": len(early_folders),
            "status": "ok",
        }
        print(f"[Early REM] Done.\n{early_rem_output[:200]}...")
    # ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
    print("\n[Late REM] Retrieving...")
    late_chunks = retrieve("late-rem", excluded_sources=previously_retrieved | session_retrieved)
    session_retrieved.update(c["source"] for c in late_chunks)
    if not late_chunks:
        print("[Late REM] No suitable chunks — skipping")
        late_rem_output = early_rem_output  # fallback
    else:
        print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
        late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
        deliver(late_rem_output, "late-rem")
        late_sources = [c["source"] for c in late_chunks]
        late_folders = [extract_folder(s) for s in late_sources]
        cross_domain_pairs = sum(
            1 for i in range(len(late_folders))
            for j in range(i+1, len(late_folders))
            if late_folders[i] != late_folders[j]
        )
        stage_data["late_rem"] = {
            "chunks_retrieved": len(late_chunks),
            "avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
            "query": "practice place memory making",
            "word_count": len(late_rem_output.split()),
            "sources": late_sources,
            "distinct_folders": list(set(late_folders)),
            "folder_count": len(set(late_folders)),
            "cross_domain_pairs": cross_domain_pairs,
            "status": "ok",
        }
        print(f"[Late REM] Done.\n{late_rem_output[:200]}...")
    # ── Stage 4: Synthesis — all three stages ─────────────────────────────
    print("\n[Synthesis] Integrating all stages...")
    synthesis_output = synthesize_final(nrem_output, early_rem_output, late_rem_output)
    synthesis_file = deliver(synthesis_output, "synthesis")
    stage_data["synthesis"] = {
        "word_count": len(synthesis_output.split()),
        "status": "ok",
    }
    print(f"\n{'='*60}")
    print("SYNTHESIS:")
    print(synthesis_output)
    print(f"{'='*60}")
    # Write manifest
    all_session_sources = list(session_retrieved)
    all_session_folders = list({extract_folder(s) for s in all_session_sources})
    corpus_data = {
        "total_chunks": delta.get("new_chunks", 0),
        "new_chunks_since_last_dream": delta.get("new_chunks", 0),
        "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
        "substrate": "pgvector",
        "aggregate": {
            "total_distinct_sources": len(all_session_sources),
            "total_distinct_folders": len(all_session_folders),
            "folders_touched": all_session_folders,
        }
    }
    write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
    # Update state and notify
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = "pipeline"
    state["last_dream_file"] = synthesis_file
    # Accumulate retrieved sources across nights. Cap at 500, trim to 400 on overflow.
    all_retrieved = list(previously_retrieved | session_retrieved)
    if len(all_retrieved) > 500:
        all_retrieved = all_retrieved[-400:]
    state["retrieved_sources"] = all_retrieved
    save_dreamer_state(state)
    notify_sse("synthesis", synthesis_file.split("/")[-1])
    print(f"\nPipeline complete. Synthesis: {synthesis_file}")
    return synthesis_file
 def dream_lucid(task):
    """On-demand lucid dream — single mode, used by Dream Now in settings."""
    print(f"Lucid dream starting — task: {task[:80] if task else 'none'}")
    chunks = retrieve("lucid", task=task)
    if not chunks:
        print("No suitable chunks — aborting")
        return None
    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")
    output = synthesize_lucid(chunks, task)
    filepath = deliver(output, "lucid", task=task)
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = "lucid"
    state["last_dream_file"] = filepath
    save_dreamer_state(state)
    notify_sse("lucid", filepath.split("/")[-1])
    print(f"\n{'='*60}")
    print(output)
    print(f"{'='*60}")
    print(f"\nDelivered to {filepath}")
    return filepath
 def dream_single(mode, task=None):
    """
    Single mode — used by Dream Now for non-lucid modes.
    Runs one stage independently (for testing/tuning individual stages).
    """
    print(f"Single mode dream: {mode}")
    chunks = retrieve(mode, task=task)
    if not chunks:
        print("No suitable chunks — aborting")
        return None
    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")
    if mode == "nrem":
        output = synthesize_nrem(chunks)
    elif mode == "early-rem":
        output = synthesize_early_rem(chunks, "")
    elif mode == "late-rem":
        output = synthesize_late_rem(chunks, "", "")
    else:
        output = synthesize_lucid(chunks, task)
    filepath = deliver(output, mode, task=task)
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = mode
    state["last_dream_file"] = filepath
    save_dreamer_state(state)
    notify_sse(mode, filepath.split("/")[-1])
    print(f"\n{'='*60}")
    print(output)
    print(f"{'='*60}")
    print(f"\nDelivered to {filepath}")
    return filepath
 # ─── CLI ────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aaron AI Dreamer")
    parser.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"])
    parser.add_argument("--task", type=str)
    args = parser.parse_args()
    if args.mode == "lucid":
        dream_lucid(args.task or "What should I be thinking about that I am not?")
    elif args.mode and args.mode != "pipeline":
        dream_single(args.mode, args.task)
    else:
        # Default: full pipeline
        dream_pipeline()
@@ -0,0 +1,171 @@
 """
 Aaron AI — Graphiti Sidecar Service
 Wraps graphiti-core in a FastAPI service to avoid asyncio event loop conflicts.
 Port 8001 (internal only). No OpenAI dependency.
 """
 import os, logging, sys
 from contextlib import asynccontextmanager
 from datetime import datetime
 from pathlib import Path
 from dotenv import load_dotenv
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 load_dotenv(Path.home() / "aaronai" / ".env")
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 log = logging.getLogger("graphiti-sidecar")
 GROUP_ID      = os.getenv("GRAPHITI_GROUP_ID", "aaron")
 FALKORDB_HOST = os.getenv("FALKORDB_HOST", "localhost")
 FALKORDB_PORT = int(os.getenv("FALKORDB_PORT", "6379"))
 LLM_PROVIDER  = os.getenv("LLM_PROVIDER", "anthropic")
 LLM_MODEL     = os.getenv("LLM_MODEL", "claude-sonnet-4-6")
 LLM_API_KEY   = os.getenv("LLM_API_KEY") or os.getenv("ANTHROPIC_API_KEY")
 os.environ["EMBEDDING_DIM"] = "384"
 def get_llm_client():
    from graphiti_core.llm_client.config import LLMConfig
    config = LLMConfig(api_key=LLM_API_KEY, model=LLM_MODEL)
    if LLM_PROVIDER == "anthropic":
        from graphiti_core.llm_client.anthropic_client import AnthropicClient
        return AnthropicClient(config)
    elif LLM_PROVIDER == "openai":
        from graphiti_core.llm_client.openai_client import OpenAIClient
        return OpenAIClient(config)
    elif LLM_PROVIDER == "gemini":
        from graphiti_core.llm_client.gemini_client import GeminiClient
        return GeminiClient(config)
    elif LLM_PROVIDER == "groq":
        from graphiti_core.llm_client.groq_client import GroqClient
        return GroqClient(config)
    raise ValueError(f"Unsupported LLM provider: {LLM_PROVIDER}")
 graphiti_instance = None
 async def get_graphiti():
    if graphiti_instance is None:
        raise HTTPException(status_code=503, detail="Graphiti not initialized")
    return graphiti_instance
@asynccontextmanager
 async def lifespan(app: FastAPI):
    global graphiti_instance
    sys.path.insert(0, str(Path.home() / "aaronai" / "scripts"))
    log.info("Loading embedding and reranker models...")
    from st_embedder import SentenceTransformerEmbedder
    from graphiti_core.cross_encoder.bge_reranker_client import BGERerankerClient
    from graphiti_core.driver.falkordb_driver import FalkorDriver
    from graphiti_core import Graphiti
    log.info(f"Connecting to FalkorDB at {FALKORDB_HOST}:{FALKORDB_PORT}...")
    graphiti_instance = Graphiti(
        llm_client=get_llm_client(),
        embedder=SentenceTransformerEmbedder(),
        cross_encoder=BGERerankerClient(),
        graph_driver=FalkorDriver(host=FALKORDB_HOST, port=FALKORDB_PORT),
    )
    await graphiti_instance.build_indices_and_constraints()
    log.info(f"Graphiti ready — provider: {LLM_PROVIDER}, group: {GROUP_ID}")
    yield
    await graphiti_instance.close()
 app = FastAPI(title="Aaron AI Graphiti Sidecar", lifespan=lifespan)
 class BulkEpisodeItem(BaseModel):
    name: str
    content: str
    source_description: str = ""
    timestamp: str | None = None
 class BulkEpisodeRequest(BaseModel):
    episodes: list[BulkEpisodeItem]
    group_id: str | None = None
 class EpisodeRequest(BaseModel):
    name: str
    content: str
    source_description: str = ""
    timestamp: str | None = None
    group_id: str | None = None
@app.get("/health")
 async def health():
    return {"ok": True, "provider": LLM_PROVIDER, "group": GROUP_ID}
@app.post("/episodes")
 async def add_episode(req: EpisodeRequest):
    g = await get_graphiti()
    from graphiti_core.nodes import EpisodeType
    try:
        ref_time = datetime.fromisoformat(req.timestamp) if req.timestamp else datetime.now()
        await g.add_episode(
            name=req.name,
            episode_body=req.content,
            source=EpisodeType.text,
            reference_time=ref_time,
            source_description=req.source_description,
            group_id=req.group_id or GROUP_ID,
        )
        return {"ok": True}
    except Exception as e:
        log.error(f"Episode ingestion failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/episodes/bulk")
 async def add_episodes_bulk(req: BulkEpisodeRequest):
    g = await get_graphiti()
    from graphiti_core.nodes import EpisodeType
    from graphiti_core.utils.bulk_utils import RawEpisode
    raw_episodes = []
    for ep in req.episodes:
        ref_time = datetime.fromisoformat(ep.timestamp) if ep.timestamp else datetime.now()
        raw_episodes.append(RawEpisode(
            name=ep.name,
            content=ep.content,
            source_description=ep.source_description,
            source=EpisodeType.text,
            reference_time=ref_time,
        ))
    try:
        result = await g.add_episode_bulk(
            bulk_episodes=raw_episodes,
            group_id=req.group_id or GROUP_ID,
        )
        return {"ok": True, "count": len(raw_episodes)}
    except Exception as e:
        log.error(f"Bulk ingestion failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/search")
 async def search(query: str, limit: int = 8, group_id: str | None = None):
    g = await get_graphiti()
    try:
        results = await g.search(
            query=query,
            num_results=limit,
            group_ids=[group_id or GROUP_ID],
        )
        return {
            "results": [
                {
                    "fact": r.fact,
                    "source": getattr(r, "source_node_uuid", ""),
                    "score": getattr(r, "score", 0),
                    "valid_at": str(getattr(r, "valid_at", "")),
                    "invalid_at": str(getattr(r, "invalid_at", "")),
                }
                for r in results
            ]
        }
    except Exception as e:
        log.error(f"Search failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=8001, log_level="info")
@@ -81,7 +81,7 @@ def enqueue_stage2(source, full_text):
                completed_at = NULL,
                failed_at = NULL,
                attempts = 0
-        """, (source, full_text[:50000], len(full_text)))
+        """, (source, full_text, len(full_text)))
        pg.commit()
        pg.close()
    except Exception as e:
@@ -0,0 +1,182 @@
 import os
 import sys
 import hashlib
 from pathlib import Path
 from dotenv import load_dotenv
 import psycopg2
 import psycopg2.extras
 import json
 from sentence_transformers import SentenceTransformer
 from docx import Document
 from pypdf import PdfReader
 from pptx import Presentation
 load_dotenv(Path.home() / "aaronai" / ".env", override=True)
 print("Loading embedding model...")
 embedder = SentenceTransformer("all-MiniLM-L6-v2")
 PG_DSN = os.getenv("PG_DSN")
 def get_pg():
    return psycopg2.connect(PG_DSN)
 def extract_text_from_docx(path):
    doc = Document(path)
    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
 def extract_text_from_pdf(path):
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        extracted = page.extract_text()
        if extracted:
            text += extracted + "\n"
    return text
 def extract_text_from_pptx(path):
    prs = Presentation(path)
    text = ""
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                text += shape.text + "\n"
    return text
 def extract_text_from_txt(path):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()
 def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap
    return chunks
 def make_id(filepath, chunk_index):
    path_hash = hashlib.md5(str(filepath).encode()).hexdigest()[:8]
    return f"{path_hash}_{chunk_index}"
 def enqueue_stage2(source, full_text):
    """Enqueue document for Stage 2 (Mistral orientation) → Stage 3 (Graphiti ingest).
    TEMPORARY: this queue feed will be removed when pgvector is decommissioned
    and the watcher calls Stage 2 directly.
    """
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            INSERT INTO stage_2_queue (source, full_text, char_length)
            VALUES (%s, %s, %s)
            ON CONFLICT (source) DO UPDATE SET
                full_text = EXCLUDED.full_text,
                char_length = EXCLUDED.char_length,
                enqueued_at = NOW(),
                completed_at = NULL,
                failed_at = NULL,
                attempts = 0
        """, (source, full_text[:50000], len(full_text)))
        pg.commit()
        pg.close()
    except Exception as e:
        print(f"  Stage 2 queue insert failed (non-fatal): {e}")
 def ingest_file(filepath):
    path = Path(filepath)
    suffix = path.suffix.lower()
    if path.name.startswith("~$") or path.name.startswith("."):
        return 0
    try:
        if suffix == ".docx":
            text = extract_text_from_docx(path)
        elif suffix == ".pdf":
            text = extract_text_from_pdf(path)
        elif suffix == ".pptx":
            text = extract_text_from_pptx(path)
        elif suffix in [".txt", ".md"]:
            text = extract_text_from_txt(path)
        else:
            return 0
        if not text.strip():
            return 0
        chunks = chunk_text(text)
        if not chunks:
            return 0
        embeddings = embedder.encode(chunks).tolist()
        ids = [make_id(path, i) for i in range(len(chunks))]
        metadatas = [{
            "source": path.name,
            "filepath": str(path),
            "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
        } for _ in chunks]
        # STAGE 1: Write to pgvector (TEMPORARY — remove when chat agent migrates to Graphiti)
        pg = get_pg()
        cur = pg.cursor()
        for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    metadata = EXCLUDED.metadata
            """, (
                chunk_id, chunk, embedding,
                meta.get("source"), "document", None,
                json.dumps(meta)
            ))
        pg.commit()
        pg.close()
        print(f"  Indexed {len(chunks)} chunks: {path.name}")
        # Enqueue for Stage 2 → Stage 3 (Graphiti pipeline)
        # SKIP_STAGE2_ENQUEUE env var set by migration scripts to prevent bulk enqueue
        if not os.getenv("SKIP_STAGE2_ENQUEUE"):
            enqueue_stage2(path.name, text)
        return len(chunks)
    except Exception as e:
        print(f"  Error: {path.name}: {e}")
        return 0
 def ingest_folder(folder_path):
    folder = Path(folder_path)
    if not folder.exists():
        print(f"Folder not found: {folder_path}")
        sys.exit(1)
    supported = [".docx", ".pdf", ".pptx", ".txt", ".md"]
    files = [f for f in folder.rglob("*")
             if f.suffix.lower() in supported
             and not f.name.startswith("~$")
             and not f.name.startswith(".")]
    if not files:
        print("No supported files found.")
        sys.exit(1)
    print(f"Found {len(files)} files to process\n")
    total_chunks = 0
    for f in files:
        total_chunks += ingest_file(f)
    print(f"\nDone. Total chunks indexed: {total_chunks}")
 if __name__ == "__main__":
    target = sys.argv[1] if len(sys.argv) > 1 else str(Path.home() / "aaronai" / "docs")
    print(f"Ingesting from: {target}\n")
    ingest_folder(target)
@@ -135,7 +135,7 @@ def enqueue_stage2(source: str, full_text: str):
                completed_at = NULL,
                failed_at    = NULL,
                attempts     = 0
-        """, (source, full_text[:50000], len(full_text)))
+        """, (source, full_text, len(full_text)))
        pg.commit()
        pg.close()
    except Exception as e:
@@ -0,0 +1,210 @@
 import time
 import subprocess
 import logging
 import json
 import threading
 from pathlib import Path
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
 INGEST_SCRIPT = "/home/aaron/aaronai/scripts/ingest.py"
 PYTHON = "/home/aaron/aaronai/venv/bin/python3"
 LOG_FILE = "/home/aaron/aaronai/watcher.log"
 STATE_FILE = "/home/aaron/aaronai/watcher_state.json"
 SUPPORTED = {'.pdf', '.docx', '.pptx', '.txt', '.md'}
 DEBOUNCE_SECONDS = 120
 STATUS_FILE = "/home/aaron/aaronai/watcher_status.json"
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
 )
 ingestion_state = {
    "status": "idle",
    "message": "",
    "file_count": 0,
    "started_at": None,
    "finished_at": None,
    "last_error": "",
 }
 ingestion_lock = threading.Lock()
 ingestion_thread = None
 def set_ingestion_state(**kwargs):
    with ingestion_lock:
        ingestion_state.update(kwargs)
 def load_state():
    if Path(STATE_FILE).exists():
        with open(STATE_FILE) as f:
            return json.load(f)
    return {}
 def save_state(state):
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f)
 def get_changed_files():
    state = load_state()
    changed = []
    root = Path(NEXTCLOUD_PATH)
    for path in root.rglob("*"):
        if path.is_dir():
            continue
        if path.suffix.lower() not in SUPPORTED:
            continue
        if path.name.startswith('.') or path.name.startswith('~$'):
            continue
        mtime = str(path.stat().st_mtime)
        key = str(path)
        if state.get(key) != mtime:
            changed.append(path)
    return changed, state
 def run_ingestion():
    changed, state = get_changed_files()
    if not changed:
        logging.info("No new or changed files detected — skipping ingestion.")
        set_ingestion_state(status="idle", message="No changes detected", file_count=0)
        return
    count = len(changed)
    logging.info(f"Found {count} new or changed files — starting ingestion...")
    set_ingestion_state(
        status="ingesting",
        message=f"Ingesting {count} file(s)...",
        file_count=count,
        started_at=time.time(),
        finished_at=None,
        last_error="",
    )
    try:
        result = subprocess.run(
            [PYTHON, INGEST_SCRIPT, NEXTCLOUD_PATH],
            capture_output=True,
            text=True,
            timeout=1800
        )
        if result.returncode == 0:
            root = Path(NEXTCLOUD_PATH)
            for path in root.rglob("*"):
                if path.is_file() and path.suffix.lower() in SUPPORTED:
                    state[str(path)] = str(path.stat().st_mtime)
            save_state(state)
            logging.info("Ingestion complete. State updated.")
            set_ingestion_state(
                status="idle",
                message=f"Last run: ingested {count} file(s) successfully",
                finished_at=time.time(),
            )
        else:
            logging.error(f"Ingestion error: {result.stderr}")
            set_ingestion_state(
                status="error",
                message="Ingestion failed — see log",
                last_error=result.stderr[-300:],
                finished_at=time.time(),
            )
    except subprocess.TimeoutExpired:
        logging.error("Ingestion timed out.")
        set_ingestion_state(
            status="error",
            message="Ingestion timed out (>30 min)",
            last_error="TimeoutExpired",
            finished_at=time.time(),
        )
    except Exception as e:
        logging.error(f"Ingestion failed: {e}")
        set_ingestion_state(
            status="error",
            message=f"Ingestion exception: {e}",
            last_error=str(e),
            finished_at=time.time(),
        )
 def start_ingestion_thread():
    global ingestion_thread
    if ingestion_thread and ingestion_thread.is_alive():
        logging.info("Ingestion already running — skipping.")
        return
    ingestion_thread = threading.Thread(target=run_ingestion, daemon=True)
    ingestion_thread.start()
 class IngestHandler(FileSystemEventHandler):
    def __init__(self):
        self.pending = False
        self.last_event = 0
    def on_any_event(self, event):
        if event.is_directory:
            return
        path = Path(event.src_path)
        if path.suffix.lower() not in SUPPORTED:
            return
        if path.name.startswith('.') or path.name.startswith('~$'):
            return
        if 'Admin/Backups' in str(path) or 'Backups' in path.parts:
            return
        if 'Journal/Media' in str(path):
            return
        if event.event_type not in ('modified', 'created', 'moved'):
            return
        logging.info(f"Event: {event.event_type} {event.src_path}")
        self.pending = True
        self.last_event = time.time()
 def write_status(handler):
    with ingestion_lock:
        status = {
            "running": True,
            "timestamp": time.time(),
            "pending": handler.pending,
            "last_event": handler.last_event,
            "ingestion": dict(ingestion_state),
        }
    with open(STATUS_FILE, 'w') as f:
        json.dump(status, f)
 def main():
    logging.info("Aaron AI Watcher starting...")
    logging.info(f"Watching: {NEXTCLOUD_PATH}")
    handler = IngestHandler()
    observer = Observer()
    observer.schedule(handler, NEXTCLOUD_PATH, recursive=True)
    observer.start()
    try:
        while True:
            write_status(handler)
            if handler.pending:
                elapsed = time.time() - handler.last_event
                if elapsed >= DEBOUNCE_SECONDS:
                    handler.pending = False
                    start_ingestion_thread()
            time.sleep(5)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
    logging.info("Watcher stopped.")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,448 @@
 """
 Aaron AI Watcher — Stage 1 of the encoding pipeline.
 Watches the Nextcloud directory for new or changed files.
 On detection, chunks + embeds documents in-process (no subprocess),
 then enqueues to stage_2_queue for async cascade processing.
 Design principles:
 - Embedding model loaded ONCE at startup, reused across all ingest runs
 - In-process ingest (no subprocess) — eliminates per-run model reload memory spike
 - Missed-file recovery on startup — ingests anything new since last state
 - Heartbeat file updated every loop tick — enables external health monitoring
 - Parity principle: no filtering, no decisions, faithful capture
 - Does NOT enqueue to stage_2_queue during bulk migration (SKIP_STAGE2_ENQUEUE env var)
 Architecture: Stage 1 (watcher) -> stage_2_queue -> Stage 2 (Mistral) -> stage_3_queue -> Stage 3 (Graphiti)
 """
 import os
 import time
 import json
 import hashlib
 import logging
 import threading
 from pathlib import Path
 import psycopg2
 from dotenv import load_dotenv
 from sentence_transformers import SentenceTransformer
 from watchdog.observers import Observer
 from watchdog.events import FileSystemEventHandler
 from docx import Document as DocxDocument
 from pypdf import PdfReader
 from pptx import Presentation
 load_dotenv(Path.home() / "aaronai" / ".env", override=True)
 NEXTCLOUD_PATH = "/home/aaron/nextcloud/data/data/aaron/files"
 LOG_FILE       = "/home/aaron/aaronai/watcher.log"
 STATE_FILE     = "/home/aaron/aaronai/watcher_state.json"
 STATUS_FILE    = "/home/aaron/aaronai/watcher_status.json"
 HEARTBEAT_FILE = "/home/aaron/aaronai/watcher_heartbeat"
 SUPPORTED        = {".pdf", ".docx", ".pptx", ".txt", ".md"}
 DEBOUNCE_SECONDS = 120
 CHUNK_SIZE       = 500
 CHUNK_OVERLAP    = 50
 EMBED_MODEL      = "all-MiniLM-L6-v2"
 PG_DSN = os.getenv("PG_DSN")
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [watcher] %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE)],
 )
 log = logging.getLogger("watcher")
 ingestion_lock  = threading.Lock()
 ingestion_state = {
    "status": "idle", "message": "", "file_count": 0,
    "started_at": None, "finished_at": None, "last_error": "",
 }
 ingestion_thread = None
 def load_embedder():
    log.info(f"Loading embedding model: {EMBED_MODEL}")
    model = SentenceTransformer(EMBED_MODEL)
    log.info("Embedding model ready.")
    return model
 def get_pg():
    return psycopg2.connect(PG_DSN)
 def extract_text(path: Path) -> str:
    suffix = path.suffix.lower()
    try:
        if suffix == ".docx":
            doc = DocxDocument(path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        elif suffix == ".pdf":
            reader = PdfReader(path)
            return "".join(
                page.extract_text() + "\n"
                for page in reader.pages if page.extract_text()
            )
        elif suffix == ".pptx":
            prs = Presentation(path)
            return "\n".join(
                shape.text for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            )
        elif suffix in {".txt", ".md"}:
            return path.read_text(encoding="utf-8", errors="ignore")
    except Exception as e:
        log.warning(f"Text extraction failed for {path.name}: {e}")
        record_ingest_failure(path, f"Text extraction failed: {e}")
    return ""
 def chunk_text(text: str) -> list:
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk = " ".join(words[start:start + CHUNK_SIZE])
        if chunk.strip():
            chunks.append(chunk)
        start += CHUNK_SIZE - CHUNK_OVERLAP
    return chunks
 def make_chunk_id(filepath: Path, chunk_index: int) -> str:
    return hashlib.md5(str(filepath).encode()).hexdigest()[:8] + f"_{chunk_index}"
 def enqueue_stage2(source: str, full_text: str):
    if os.getenv("SKIP_STAGE2_ENQUEUE"):
        return
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            INSERT INTO stage_2_queue (source, full_text, char_length)
            VALUES (%s, %s, %s)
            ON CONFLICT (source) DO UPDATE SET
                full_text    = EXCLUDED.full_text,
                char_length  = EXCLUDED.char_length,
                enqueued_at  = NOW(),
                completed_at = NULL,
                failed_at    = NULL,
                attempts     = 0
        """, (source, full_text[:50000], len(full_text)))
        pg.commit()
        pg.close()
    except Exception as e:
        log.warning(f"Stage 2 enqueue failed (non-fatal): {e}")
 def record_ingest_failure(filepath: Path, error: str):
    """Write extraction or ingest failure to ingest_failures table for UI visibility."""
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            INSERT INTO ingest_failures (source, filepath, error, retry_count, first_failed_at, last_failed_at)
            VALUES (%s, %s, %s, 0, NOW(), NOW())
            ON CONFLICT (source) DO UPDATE SET
                error          = EXCLUDED.error,
                retry_count    = ingest_failures.retry_count + 1,
                last_failed_at = NOW(),
                resolved       = FALSE
        """, (filepath.name, str(filepath), error[:1000]))
        pg.commit()
        pg.close()
    except Exception as e:
        log.warning(f"Could not record ingest failure (non-fatal): {e}")
 def resolve_ingest_failure(source: str):
    """Mark a previously failed file as resolved after successful ingest."""
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("UPDATE ingest_failures SET resolved = TRUE WHERE source = %s", (source,))
        pg.commit()
        pg.close()
    except Exception as e:
        log.warning(f"Could not resolve ingest failure record (non-fatal): {e}")
 def ingest_file(filepath: Path, embedder) -> int:
    if filepath.name.startswith(("~$", ".")):
        return 0
    if filepath.suffix.lower() not in SUPPORTED:
        return 0
    text = extract_text(filepath)
    if not text.strip():
        return 0
    chunks = chunk_text(text)
    if not chunks:
        return 0
    try:
        embeddings = embedder.encode(chunks).tolist()
    except Exception as e:
        log.error(f"Embedding failed for {filepath.name}: {e}")
        record_ingest_failure(filepath, f"Embedding failed: {e}")
        return 0
    source = filepath.name
    try:
        pg  = get_pg()
        cur = pg.cursor()
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            chunk_id = make_chunk_id(filepath, i)
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, NOW(), %s)
                ON CONFLICT (id) DO UPDATE SET
                    document  = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source    = EXCLUDED.source,
                    metadata  = EXCLUDED.metadata
            """, (chunk_id, chunk, embedding, source, "document",
                  json.dumps({"source": source, "filepath": str(filepath)})))
        pg.commit()
        pg.close()
    except Exception as e:
        log.error(f"pgvector write failed for {filepath.name}: {e}")
        record_ingest_failure(filepath, f"pgvector write failed: {e}")
        return 0
    log.info(f"Indexed {len(chunks)} chunks: {filepath.name}")
    resolve_ingest_failure(source)
    enqueue_stage2(source, text)
    return len(chunks)
 def ingest_files(paths: list, embedder, state: dict) -> dict:
    total = 0
    for path in paths:
        count = ingest_file(path, embedder)
        total += count
        state[str(path)] = str(path.stat().st_mtime)
    log.info(f"Ingestion complete. {total} chunks across {len(paths)} files.")
    return state
 def load_state() -> dict:
    if Path(STATE_FILE).exists():
        try:
            with open(STATE_FILE) as f:
                return json.load(f)
        except Exception:
            pass
    return {}
 def save_state(state: dict):
    with open(STATE_FILE, "w") as f:
        json.dump(state, f)
 def get_changed_files(state: dict) -> list:
    changed = []
    root = Path(NEXTCLOUD_PATH)
    for path in root.rglob("*"):
        if path.is_dir():
            continue
        if path.suffix.lower() not in SUPPORTED:
            continue
        if path.name.startswith((".", "~$")):
            continue
        if "Admin/Backups" in str(path) or "Backups" in path.parts:
            continue
        if "Journal/Media" in str(path):
            continue
        if state.get(str(path)) != str(path.stat().st_mtime):
            changed.append(path)
    return changed
 def set_ingestion_state(**kwargs):
    with ingestion_lock:
        ingestion_state.update(kwargs)
 def write_status(handler):
    with ingestion_lock:
        status = {
            "running": True, "timestamp": time.time(),
            "pending": handler.pending, "last_event": handler.last_event,
            "ingestion": dict(ingestion_state),
        }
    try:
        with open(STATUS_FILE, "w") as f:
            json.dump(status, f)
    except Exception:
        pass
 def write_heartbeat():
    try:
        Path(HEARTBEAT_FILE).write_text(str(time.time()))
    except Exception:
        pass
 def run_ingestion(embedder):
    state   = load_state()
    changed = get_changed_files(state)
    if not changed:
        log.info("No new or changed files — skipping ingestion.")
        set_ingestion_state(status="idle", message="No changes detected", file_count=0)
        return
    count = len(changed)
    log.info(f"Found {count} new or changed files — starting ingestion...")
    set_ingestion_state(
        status="ingesting", message=f"Ingesting {count} file(s)...",
        file_count=count, started_at=time.time(), finished_at=None, last_error="",
    )
    try:
        state = ingest_files(changed, embedder, state)
        save_state(state)
        set_ingestion_state(
            status="idle",
            message=f"Last run: ingested {count} file(s) successfully",
            finished_at=time.time(),
        )
    except Exception as e:
        log.error(f"Ingestion failed: {e}")
        set_ingestion_state(
            status="error", message=f"Ingestion exception: {e}",
            last_error=str(e), finished_at=time.time(),
        )
 def start_ingestion_thread(embedder):
    global ingestion_thread
    with ingestion_lock:
        if ingestion_thread and ingestion_thread.is_alive():
            log.info("Ingestion already running — skipping.")
            return
        ingestion_thread = threading.Thread(
            target=run_ingestion, args=(embedder,), daemon=True
        )
        ingestion_thread.start()
 class IngestHandler(FileSystemEventHandler):
    def __init__(self):
        self.pending    = False
        self.last_event = 0
    def _should_ignore(self, path: Path) -> bool:
        if path.name.startswith((".", "~$")):
            return True
        if "Admin/Backups" in str(path) or "Backups" in path.parts:
            return True
        if "Journal/Media" in str(path):
            return True
        return False
    def on_created(self, event):
        if event.is_directory:
            return
        path = Path(event.src_path)
        if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
            return
        log.info(f"Event: created {path}")
        self.pending = True
        self.last_event = time.time()
    def on_modified(self, event):
        if event.is_directory:
            return
        path = Path(event.src_path)
        if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
            return
        log.info(f"Event: modified {path}")
        self.pending = True
        self.last_event = time.time()
    def on_moved(self, event):
        if event.is_directory:
            return
        # Nextcloud WebDAV writes .part temp files then renames to final path.
        # src_path is the .part file; dest_path is the final filename.
        dest = Path(event.dest_path)
        if dest.suffix.lower() not in SUPPORTED or self._should_ignore(dest):
            return
        log.info(f"Event: moved -> {dest}")
        self.pending = True
        self.last_event = time.time()
    def on_closed(self, event):
        # FileClosedEvent fires on the final file after Nextcloud completes write.
        # Belt-and-suspenders catch for any write pattern not caught by on_moved.
        if event.is_directory:
            return
        path = Path(event.src_path)
        if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
            return
        log.info(f"Event: closed {path}")
        self.pending = True
        self.last_event = time.time()
 def main():
    log.info("Aaron AI Watcher starting...")
    log.info(f"Watching: {NEXTCLOUD_PATH}")
    embedder = load_embedder()
    log.info("Startup scan: checking for files missed since last run...")
    state  = load_state()
    missed = get_changed_files(state)
    if missed:
        log.info(f"Startup recovery: {len(missed)} missed file(s) — ingesting now.")
        set_ingestion_state(
            status="ingesting",
            message=f"Startup recovery: ingesting {len(missed)} missed file(s)...",
            file_count=len(missed), started_at=time.time(),
        )
        try:
            state = ingest_files(missed, embedder, state)
            save_state(state)
            set_ingestion_state(
                status="idle",
                message=f"Startup recovery complete: {len(missed)} file(s) ingested.",
                finished_at=time.time(),
            )
        except Exception as e:
            log.error(f"Startup recovery failed: {e}")
            set_ingestion_state(status="error", message=str(e),
                                last_error=str(e), finished_at=time.time())
    else:
        log.info("Startup scan: no missed files.")
    handler  = IngestHandler()
    observer = Observer()
    observer.schedule(handler, NEXTCLOUD_PATH, recursive=True)
    observer.start()
    log.info("Observer started.")
    try:
        while True:
            write_heartbeat()
            write_status(handler)
            if handler.pending:
                elapsed = time.time() - handler.last_event
                if elapsed >= DEBOUNCE_SECONDS:
                    handler.pending = False
                    start_ingestion_thread(embedder)
            time.sleep(5)
    except KeyboardInterrupt:
        log.info("KeyboardInterrupt — stopping.")
        observer.stop()
    observer.join()
    log.info("Watcher stopped.")
 if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
 1777602395.781194