Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37

- api.py: strip CV pinning workaround (parity violation, see architecture doc).
- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x, and filters in-process. It was silently dropping the parameter, which would have confounded E3 with broken cross-stage exclusion in the Graphiti arm.
- watcher.py + ingest.py: F14 — drop the full_text[:50000] truncation, which was propagating through the entire cascade. Postgres TEXT can hold up to 1GB.
- corpus_integrity.py: F37 — same truncation; third path now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.*, timestamped pre-fix.

Re-cascaded Shop Class as Soulcraft (the only already-cascaded source affected by F14, 414KB).
2026-05-01 02:26:37 +00:00
parent 25e42c0231
commit 465f2f725b
17 changed files with 4432 additions and 58 deletions
@@ -161,8 +161,6 @@ def require_auth(request: Request):
        raise HTTPException(status_code=401, detail="Not authenticated")
    return token

-CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"]
-
 def init_conversations_db():
    conn = sqlite3.connect(CONVERSATIONS_DB)
    c = conn.cursor()
@@ -224,50 +222,23 @@ def remove_from_memory(item):
    save_memory("\n".join(filtered))
    return len(lines) - len(filtered)

-def get_pinned_cv_context():
-    try:
-        pg = get_pg()
-        cur = pg.cursor()
-        cur.execute(
-            "SELECT document, source FROM embeddings WHERE source = ANY(%s)",
-            (CV_SOURCES,)
-        )
-        rows = cur.fetchall()
-        pg.close()
-        docs = [r[0] for r in rows]
-        metas = [{"source": r[1]} for r in rows]
-        return docs, metas
-    except:
-        return [], []
-
-def is_professional_query(query):
-    keywords = ["grant", "publication", "exhibition", "award", "fellowship",
-        "experience", "position", "job", "career", "cv", "resume",
-        "research", "work history", "accomplishment", "teaching",
-        "course", "client", "consultation", "presentation", "workshop",
-        "education", "degree", "institution", "service", "committee"]
-    return any(k in query.lower() for k in keywords)
-
 def retrieve_context(query, n_results=8):
+    """Pure semantic retrieval over pgvector. Top-N by cosine similarity, threshold 0.3.
+    No CV pinning, no keyword routing — see architecture doc substrate-dependency section.
+    Substrate-level workarounds (entity-keyed routing, hybrid retrieval) live at the
+    Graphiti layer, not as wrapper logic above pgvector."""
    query_embedding = embedder.encode([query]).tolist()[0]
    context_pieces = []
    sources = []
-    if is_professional_query(query):
-        cv_docs, cv_metas = get_pinned_cv_context()
-        for doc, meta in zip(cv_docs, cv_metas):
-            context_pieces.append(f"[CV] {doc}")
-            sources.append(meta.get("source", "CV"))
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("""
            SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
            FROM embeddings
-            WHERE source NOT IN %s
            ORDER BY embedding <=> %s::vector
            LIMIT %s
-        """, (query_embedding, tuple(CV_SOURCES) if CV_SOURCES else ('__none__',),
-              query_embedding, n_results))
+        """, (query_embedding, query_embedding, n_results))
        for doc, source, similarity in cur.fetchall():
            if similarity > 0.3:
                context_pieces.append(doc)