Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37

- api.py: strip CV pinning workaround (parity violation, see architecture doc)
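- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources; when any
  are given it over-fetches 3x and filters in-process. It also now returns
  at most one fact per source. Previously retrieve() silently dropped the
  parameter when dispatching to Graphiti, which would have confounded E3
  with broken cross-stage exclusion in the Graphiti arm.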
- watcher.py + ingest.py: F14 — drop full_text[:50000] truncation. The
  truncated text was propagating through the entire cascade. Postgres TEXT
  can hold up to 1GB.
- corpus_integrity.py: F37 — same truncation, third path now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*,
corpus_integrity.py.bak.* timestamped pre-fix.

Re-cascaded Shop Class as Soulcraft (only already-cascaded source affected
by F14, 414KB).
This commit is contained in:
2026-05-01 02:26:37 +00:00
parent 25e42c0231
commit 465f2f725b
17 changed files with 4432 additions and 58 deletions
+22 -4
View File
@@ -111,11 +111,16 @@ def get_recent_conversation_topics(days=14):
# ─── Stage 2: Retrieve ──────────────────────────────────────────────────────
def retrieve_graphiti(mode, task=None, n_results=8):
def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
"""E3 experiment — Graphiti substrate retrieval.
Queries Graphiti /search endpoint instead of pgvector.
Returns chunks in same format as retrieve() for pipeline compatibility.
Note: content is Graphiti facts (synthesized relationships), not raw chunks.
Over-fetches by 3x to allow in-process filtering against excluded_sources,
matching the cross-stage exclusion mechanism the pgvector branch uses.
Without this filter, NREM/Early REM/Late REM would see overlapping content
and the score-band Early REM exclusion (v1.1) would not apply in Graphiti mode.
"""
import requests as req_lib
if task:
@@ -129,25 +134,38 @@ def retrieve_graphiti(mode, task=None, n_results=8):
else:
query = "research fabrication teaching practice recent work"
excluded_sources = excluded_sources or set()
# Over-fetch so in-process exclusion still leaves enough results
fetch_limit = n_results * 3 if excluded_sources else n_results
try:
resp = req_lib.get(
"http://localhost:8001/search",
params={"query": query, "limit": n_results, "group_id": "aaron"},
params={"query": query, "limit": fetch_limit, "group_id": "aaron"},
timeout=30,
)
resp.raise_for_status()
results = resp.json().get("results", [])
chunks = []
seen_sources = set()
for r in results:
fact = r.get("fact", "")
if not fact.strip():
continue
source = r.get("source", "graphiti")
if source in excluded_sources:
continue
if source in seen_sources:
continue
chunks.append({
"source": r.get("source", "graphiti"),
"source": source,
"content": fact,
"relevance": r.get("score", 0.5),
"similarity": r.get("score", 0.5),
})
seen_sources.add(source)
if len(chunks) >= n_results:
break
return chunks
except Exception as e:
print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
@@ -158,7 +176,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None):
# Default behavior: pgvector similarity search (unchanged)
substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
if substrate == "graphiti":
return retrieve_graphiti(mode, task=task, n_results=n_results)
return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")
low, high = MODE_RANGES[mode]