Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37

- api.py: strip the CV pinning workaround (parity violation; see architecture doc).
- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x, and filters in-process. It was silently dropping the parameter, which would have confounded E3 with broken cross-stage exclusion in the Graphiti arm.
- watcher.py + ingest.py: F14 — drop the full_text[:50000] truncation, which was propagating through the entire cascade. Postgres TEXT can hold up to 1GB.
- corpus_integrity.py: F37 — same truncation; third path now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.*, timestamped pre-fix.

Re-cascaded Shop Class as Soulcraft (the only already-cascaded source affected by F14, 414KB).
2026-05-01 02:26:37 +00:00
parent 25e42c0231
commit 465f2f725b
17 changed files with 4432 additions and 58 deletions
@@ -111,11 +111,16 @@ def get_recent_conversation_topics(days=14):
 # ─── Stage 2: Retrieve ──────────────────────────────────────────────────────


-def retrieve_graphiti(mode, task=None, n_results=8):
+def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
    """E3 experiment — Graphiti substrate retrieval.
    Queries Graphiti /search endpoint instead of pgvector.
    Returns chunks in same format as retrieve() for pipeline compatibility.
    Note: content is Graphiti facts (synthesized relationships), not raw chunks.
+
+    Over-fetches by 3x to allow in-process filtering against excluded_sources,
+    matching the cross-stage exclusion mechanism the pgvector branch uses.
+    Without this filter, NREM/Early REM/Late REM would see overlapping content
+    and the score-band Early REM exclusion (v1.1) would not apply in Graphiti mode.
    """
    import requests as req_lib
    if task:
@@ -129,25 +134,38 @@ def retrieve_graphiti(mode, task=None, n_results=8):
    else:
        query = "research fabrication teaching practice recent work"

+    excluded_sources = excluded_sources or set()
+    # Over-fetch so in-process exclusion still leaves enough results
+    fetch_limit = n_results * 3 if excluded_sources else n_results
+
    try:
        resp = req_lib.get(
            "http://localhost:8001/search",
-            params={"query": query, "limit": n_results, "group_id": "aaron"},
+            params={"query": query, "limit": fetch_limit, "group_id": "aaron"},
            timeout=30,
        )
        resp.raise_for_status()
        results = resp.json().get("results", [])
        chunks = []
+        seen_sources = set()
        for r in results:
            fact = r.get("fact", "")
            if not fact.strip():
                continue
+            source = r.get("source", "graphiti")
+            if source in excluded_sources:
+                continue
+            if source in seen_sources:
+                continue
            chunks.append({
-                "source": r.get("source", "graphiti"),
+                "source": source,
                "content": fact,
                "relevance": r.get("score", 0.5),
                "similarity": r.get("score", 0.5),
            })
+            seen_sources.add(source)
+            if len(chunks) >= n_results:
+                break
        return chunks
    except Exception as e:
        print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
@@ -158,7 +176,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None):
    # Default behavior: pgvector similarity search (unchanged)
    substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
    if substrate == "graphiti":
-        return retrieve_graphiti(mode, task=task, n_results=n_results)
+        return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    low, high = MODE_RANGES[mode]