Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37
- api.py: strip CV pinning workaround (parity violation, see architecture doc).
- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x, and filters in-process. Previously the parameter was silently dropped, which would have confounded E3 with broken cross-stage exclusion in the Graphiti arm.
- watcher.py + ingest.py: F14 — drop the full_text[:50000] truncation, which was propagating through the entire cascade. Postgres TEXT can hold up to 1GB.
- corpus_integrity.py: F37 — same truncation; third path now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.* (timestamped, pre-fix).
Re-cascaded Shop Class as Soulcraft (the only already-cascaded source affected by F14, 414KB).
This commit is contained in:
+22
-4
@@ -111,11 +111,16 @@ def get_recent_conversation_topics(days=14):
|
||||
# ─── Stage 2: Retrieve ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def retrieve_graphiti(mode, task=None, n_results=8):
|
||||
def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
|
||||
"""E3 experiment — Graphiti substrate retrieval.
|
||||
Queries Graphiti /search endpoint instead of pgvector.
|
||||
Returns chunks in same format as retrieve() for pipeline compatibility.
|
||||
Note: content is Graphiti facts (synthesized relationships), not raw chunks.
|
||||
|
||||
Over-fetches by 3x to allow in-process filtering against excluded_sources,
|
||||
matching the cross-stage exclusion mechanism the pgvector branch uses.
|
||||
Without this filter, NREM/Early REM/Late REM would see overlapping content
|
||||
and the score-band Early REM exclusion (v1.1) would not apply in Graphiti mode.
|
||||
"""
|
||||
import requests as req_lib
|
||||
if task:
|
||||
@@ -129,25 +134,38 @@ def retrieve_graphiti(mode, task=None, n_results=8):
|
||||
else:
|
||||
query = "research fabrication teaching practice recent work"
|
||||
|
||||
excluded_sources = excluded_sources or set()
|
||||
# Over-fetch so in-process exclusion still leaves enough results
|
||||
fetch_limit = n_results * 3 if excluded_sources else n_results
|
||||
|
||||
try:
|
||||
resp = req_lib.get(
|
||||
"http://localhost:8001/search",
|
||||
params={"query": query, "limit": n_results, "group_id": "aaron"},
|
||||
params={"query": query, "limit": fetch_limit, "group_id": "aaron"},
|
||||
timeout=30,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
results = resp.json().get("results", [])
|
||||
chunks = []
|
||||
seen_sources = set()
|
||||
for r in results:
|
||||
fact = r.get("fact", "")
|
||||
if not fact.strip():
|
||||
continue
|
||||
source = r.get("source", "graphiti")
|
||||
if source in excluded_sources:
|
||||
continue
|
||||
if source in seen_sources:
|
||||
continue
|
||||
chunks.append({
|
||||
"source": r.get("source", "graphiti"),
|
||||
"source": source,
|
||||
"content": fact,
|
||||
"relevance": r.get("score", 0.5),
|
||||
"similarity": r.get("score", 0.5),
|
||||
})
|
||||
seen_sources.add(source)
|
||||
if len(chunks) >= n_results:
|
||||
break
|
||||
return chunks
|
||||
except Exception as e:
|
||||
print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
|
||||
@@ -158,7 +176,7 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None):
|
||||
# Default behavior: pgvector similarity search (unchanged)
|
||||
substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
|
||||
if substrate == "graphiti":
|
||||
return retrieve_graphiti(mode, task=task, n_results=n_results)
|
||||
return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
|
||||
from sentence_transformers import SentenceTransformer
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
low, high = MODE_RANGES[mode]
|
||||
|
||||
Reference in New Issue
Block a user