Code review fixes: CV pinning, F1 (excluded_sources), F14 (50KB truncation), F37
- api.py: strip the CV pinning workaround (parity violation; see architecture doc).
- dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x, and filters in-process. It was silently dropping the parameter, which would have confounded E3 with broken cross-stage exclusion in the Graphiti arm.
- watcher.py + ingest.py: F14 — drop the full_text[:50000] truncation, which was propagating through the entire cascade. Postgres TEXT can hold up to 1GB.
- corpus_integrity.py: F37 — same truncation; the third path is now clean.

Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.*, timestamped pre-fix.

Re-cascaded Shop Class as Soulcraft (the only already-cascaded source affected by F14, 414KB).
This commit is contained in:
+5
-34
@@ -161,8 +161,6 @@ def require_auth(request: Request):
|
||||
raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
return token
|
||||
|
||||
# `source` values of the CV documents in the `embeddings` table. Used both to
# fetch the CV chunks directly and to exclude them from the similarity search.
CV_SOURCES = ["Aaron Nelson CV 2024.pdf", "Aaron Nelson CV 2025.pdf", "Aaron Nelson - CV.docx"]
|
||||
|
||||
def init_conversations_db():
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
c = conn.cursor()
|
||||
@@ -224,50 +222,23 @@ def remove_from_memory(item):
|
||||
save_memory("\n".join(filtered))
|
||||
return len(lines) - len(filtered)
|
||||
|
||||
def get_pinned_cv_context():
    """Fetch every stored chunk whose source is one of ``CV_SOURCES``.

    Returns:
        A ``(docs, metas)`` pair. ``docs`` is the list of ``document`` values
        and ``metas`` is a parallel list of ``{"source": <source>}`` dicts.
        If the lookup fails for any reason the result is ``([], [])`` so that
        callers can treat CV context as optional.
    """
    import logging
    from contextlib import closing

    try:
        # closing() releases the connection even when the query raises;
        # previously pg.close() was skipped on any error and the handle leaked.
        with closing(get_pg()) as pg:
            cur = pg.cursor()
            cur.execute(
                "SELECT document, source FROM embeddings WHERE source = ANY(%s)",
                (CV_SOURCES,)
            )
            rows = cur.fetchall()
    except Exception:
        # Best-effort by design, but no longer silent, and no longer swallowing
        # KeyboardInterrupt / SystemExit the way a bare ``except:`` did.
        logging.getLogger(__name__).warning(
            "CV context lookup failed", exc_info=True
        )
        return [], []

    docs = [row[0] for row in rows]
    metas = [{"source": row[1]} for row in rows]
    return docs, metas
|
||||
|
||||
def is_professional_query(query: str) -> bool:
    """Return True if *query* mentions any professional/CV-related keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (e.g. "grant" matches "grants").

    Args:
        query: The raw user query text.

    Returns:
        True when at least one keyword occurs in the lower-cased query,
        otherwise False.
    """
    keywords = (
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee",
    )
    # Lower-case once instead of once per keyword.
    lowered = query.lower()
    return any(keyword in lowered for keyword in keywords)
|
||||
|
||||
def retrieve_context(query, n_results=8):
|
||||
"""Pure semantic retrieval over pgvector. Top-N by cosine similarity, threshold 0.3.
|
||||
No CV pinning, no keyword routing — see architecture doc substrate-dependency section.
|
||||
Substrate-level workarounds (entity-keyed routing, hybrid retrieval) live at the
|
||||
Graphiti layer, not as wrapper logic above pgvector."""
|
||||
query_embedding = embedder.encode([query]).tolist()[0]
|
||||
context_pieces = []
|
||||
sources = []
|
||||
if is_professional_query(query):
|
||||
cv_docs, cv_metas = get_pinned_cv_context()
|
||||
for doc, meta in zip(cv_docs, cv_metas):
|
||||
context_pieces.append(f"[CV] {doc}")
|
||||
sources.append(meta.get("source", "CV"))
|
||||
try:
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
cur.execute("""
|
||||
SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
|
||||
FROM embeddings
|
||||
WHERE source NOT IN %s
|
||||
ORDER BY embedding <=> %s::vector
|
||||
LIMIT %s
|
||||
""", (query_embedding, tuple(CV_SOURCES) if CV_SOURCES else ('__none__',),
|
||||
query_embedding, n_results))
|
||||
""", (query_embedding, query_embedding, n_results))
|
||||
for doc, source, similarity in cur.fetchall():
|
||||
if similarity > 0.3:
|
||||
context_pieces.append(doc)
|
||||
|
||||
Reference in New Issue
Block a user