Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37

- api.py: strip CV pinning workaround (parity violation, see architecture doc)
- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches
  3x, and filters in-process. Previously the parameter was silently dropped,
  which would have confounded E3 by breaking cross-stage exclusion in the
  Graphiti arm.
- watcher.py + ingest.py: F14 — drop the full_text[:50000] truncation. The
  truncated text was propagating through the entire cascade; a Postgres TEXT
  column can hold up to 1GB, so the cap was not needed.
- corpus_integrity.py: F37 — same truncation, third path now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*,
corpus_integrity.py.bak.* timestamped pre-fix.

Re-cascaded Shop Class as Soulcraft (414KB), the only already-cascaded source
affected by F14.
This commit is contained in:
2026-05-01 02:26:37 +00:00
parent 25e42c0231
commit 465f2f725b
17 changed files with 4432 additions and 58 deletions
+5 -34
View File
@@ -161,8 +161,6 @@ def require_auth(request: Request):
raise HTTPException(status_code=401, detail="Not authenticated")
return token
# Values of the `source` column in the `embeddings` table that identify the CV
# documents. Used to select the pinned CV rows and to exclude those same rows
# from the similarity search.
CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"]
def init_conversations_db():
conn = sqlite3.connect(CONVERSATIONS_DB)
c = conn.cursor()
@@ -224,50 +222,23 @@ def remove_from_memory(item):
save_memory("\n".join(filtered))
return len(lines) - len(filtered)
def get_pinned_cv_context():
    """Fetch every stored chunk that belongs to one of the pinned CV documents.

    Runs one query against the ``embeddings`` table, selecting the rows whose
    ``source`` is listed in ``CV_SOURCES``.

    Returns:
        A ``(docs, metas)`` pair of parallel lists: ``docs`` holds the
        ``document`` value of each row and ``metas`` holds a
        ``{"source": <source>}`` dict for each row.

        The lookup is best-effort: if connecting, querying, or closing the
        connection fails, the error is logged and ``([], [])`` is returned
        instead of raising.
    """
    # Function-scope import: the file's top-level import block is not visible
    # from this section, so the dependency is brought in locally.
    import logging

    try:
        pg = get_pg()
        try:
            cur = pg.cursor()
            cur.execute(
                "SELECT document, source FROM embeddings WHERE source = ANY(%s)",
                (CV_SOURCES,)
            )
            rows = cur.fetchall()
        finally:
            # Always release the connection, including when the query fails.
            pg.close()
        docs = [row[0] for row in rows]
        metas = [{"source": row[1]} for row in rows]
        return docs, metas
    except Exception:
        # Deliberately best-effort, but no longer silent, and no longer
        # swallowing KeyboardInterrupt / SystemExit as a bare ``except:`` did.
        logging.getLogger(__name__).exception("Pinned CV context lookup failed")
        return [], []
def is_professional_query(query):
    """Return True if *query* contains any of the hard-coded professional keywords.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (for example ``"grant"`` matches ``"immigrant"``).

    Args:
        query: The user's query text. ``None`` or an empty string is treated
            as containing no keyword.

    Returns:
        ``True`` when at least one keyword occurs in the lower-cased query,
        otherwise ``False``.
    """
    if not query:
        return False

    keywords = (
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee",
    )
    # Lower-case once rather than once per keyword.
    lowered = query.lower()
    return any(keyword in lowered for keyword in keywords)
def retrieve_context(query, n_results=8):
"""Pure semantic retrieval over pgvector. Top-N by cosine similarity, threshold 0.3.
No CV pinning, no keyword routing — see architecture doc substrate-dependency section.
Substrate-level workarounds (entity-keyed routing, hybrid retrieval) live at the
Graphiti layer, not as wrapper logic above pgvector."""
query_embedding = embedder.encode([query]).tolist()[0]
context_pieces = []
sources = []
if is_professional_query(query):
cv_docs, cv_metas = get_pinned_cv_context()
for doc, meta in zip(cv_docs, cv_metas):
context_pieces.append(f"[CV] {doc}")
sources.append(meta.get("source", "CV"))
try:
pg = get_pg()
cur = pg.cursor()
cur.execute("""
SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
FROM embeddings
WHERE source NOT IN %s
ORDER BY embedding <=> %s::vector
LIMIT %s
""", (query_embedding, tuple(CV_SOURCES) if CV_SOURCES else ('__none__',),
query_embedding, n_results))
""", (query_embedding, query_embedding, n_results))
for doc, source, similarity in cur.fetchall():
if similarity > 0.3:
context_pieces.append(doc)