api.py: folder-aware retrieval, near-duplicate dedup, folder in citations

Three refinements to retrieve_context, all keyed off observed failures from
test_retrieval.py:

- Library/personal split. classify_retrieval_intent now returns
  (type_filter, folder_exclude_prefixes). Biographical document intent excludes
  Library/* so philosophy/cognition books stop crowding out CVs and dossiers
  for queries like "write me a bio".

- Near-duplicate collapse. Multi-folder copies of the same file (e.g., several
  copies of Teaching Philosophy.pdf in different application folders) used to
  fill the top-N with the same content. Dedup by a hash of the first 300
  characters, applied after rerank.

- Folder in source citations. Surface metadata.folder alongside basename so
  the LLM can disambiguate among 21 CV.docx variants and the user can see
  which copy a citation refers to.

Also: bump hnsw.ef_search to 500 when a WHERE filter is present.
pgvector 0.6 doesn't iterate past its initial HNSW candidate list, so a
restrictive filter that excludes the nearest neighbors would otherwise
return no rows.
This commit is contained in:
2026-05-19 21:35:28 +00:00
parent 8d560f9f5e
commit 50b97e2998
2 changed files with 83 additions and 33 deletions
+78 -30
View File
@@ -277,25 +277,32 @@ def _websearch_query(text: str) -> str:
def classify_retrieval_intent(query: str):
    """Return (type_filter, folder_exclude_prefixes). Either may be None.

    type_filter restricts the candidate pool by `type`; folder_exclude_prefixes
    excludes any chunk whose metadata.folder matches a LIKE 'prefix%' pattern.

    Implementation is a low-effort keyword classifier — explicitly tunable and
    swappable. For nuanced routing, replace with an LLM classifier returning
    the same shape.

    Precedence: conversation signals win over document signals — "what did I
    tell you about my CV" is asking about the conversation, not the CV.

    For biographical/document intent, also exclude the reference library
    (every folder under Library/, e.g. Library/Foundations/* — philosophy and
    cognition books), which is categorically different from personal artifacts
    but lives in the same `type='document'` bucket.

    Returns:
        (CONVERSATION_TYPES, None) when a conversation signal matches,
        (DOCUMENT_TYPES, ["Library/"]) when a document signal matches,
        (None, None) otherwise (no restriction).
    """
    q = query.lower()
    # Checked first so conversation intent takes precedence over document intent.
    if any(s in q for s in _CONVO_SIGNALS):
        return (CONVERSATION_TYPES, None)
    if any(s in q for s in _DOC_SIGNALS):
        return (DOCUMENT_TYPES, ["Library/"])
    return (None, None)
def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
"""Cross-encoder rerank. Candidates are (id, document, source) tuples.
"""Cross-encoder rerank. Candidates are (id, document, source, folder) tuples.
Returns the same tuples reordered by reranker score (highest first)."""
if not candidates:
return []
@@ -305,7 +312,25 @@ def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
key=lambda x: x[1], reverse=True)]
def retrieve_context(query, n_results=FINAL_LIMIT, type_filter=None):
def _format_source(source: str, folder: str) -> str:
    """Build the citation label for a retrieved chunk.

    Prefixes the file name with its folder so the LLM can disambiguate
    same-named files (e.g., 21 different CV.docx files across job-application
    folders) and the user can see which copy a citation refers to.

    Args:
        source: File name of the chunk; a falsy value becomes "unknown".
        folder: Folder of the chunk; None, "" and "." all mean "no folder".

    Returns:
        "folder/source" when a meaningful folder is present, otherwise the
        bare source name.
    """
    name = source or "unknown"
    # Strip trailing separators so a folder stored as "Jobs/Acme/" does not
    # yield "Jobs/Acme//CV.docx".
    location = (folder or "").rstrip("/")
    if location in ("", "."):
        return name
    return f"{location}/{name}"
def _dedup_key(doc: str) -> str:
    """Return a hex digest used to collapse near-duplicate chunks.

    The key is the MD5 of the first 300 characters of `doc`, lowercased.
    Files copied to multiple folders produce identical chunks and therefore
    share a key. Chunks that differ only in letter case, or only after the
    first 300 characters, also share a key; chunks whose openings differ
    (e.g., separate sections of a conversation) get distinct keys.

    Args:
        doc: Chunk text.

    Returns:
        32-character hexadecimal digest.
    """
    head = doc[:300].lower().encode("utf-8", "ignore")
    # MD5 is only a content fingerprint here, not a security control; saying
    # so keeps the call working on FIPS-restricted Python builds.
    return hashlib.md5(head, usedforsecurity=False).hexdigest()
def retrieve_context(query, n_results=FINAL_LIMIT,
type_filter=None, folder_exclude_prefixes=None):
"""Hybrid retrieval (dense + lexical, RRF fused) followed by cross-encoder rerank.
- Dense (pgvector) handles paraphrase / semantic similarity.
@@ -314,48 +339,61 @@ def retrieve_context(query, n_results=FINAL_LIMIT, type_filter=None):
- RRF combines the two rankings without calibrating score scales.
- Cross-encoder rerank scores each (query, chunk) pair jointly, bridging
semantic gaps that bi-encoders can't (e.g., "write me a bio" -> CV chunk).
- Near-duplicate collapse on output so top-N slots aren't burned by
multi-folder copies of the same file.
type_filter: optional list of `type` values to restrict the candidate pool to.
If None, retrieves from all types. Use classify_retrieval_intent() to derive."""
folder_exclude_prefixes: optional list of folder LIKE prefixes to exclude.
Both default to None (no restriction). Use classify_retrieval_intent() to derive."""
query_embedding = embedder.encode([query]).tolist()[0]
ts_query = _websearch_query(query)
context_pieces = []
sources = []
where_sql = ""
type_param = ()
where_clauses = []
extra_params = []
if type_filter:
where_sql = "WHERE type = ANY(%s)"
type_param = (list(type_filter),)
where_clauses.append("type = ANY(%s)")
extra_params.append(list(type_filter))
for prefix in (folder_exclude_prefixes or []):
where_clauses.append("(metadata->>'folder' IS NULL OR metadata->>'folder' NOT LIKE %s)")
extra_params.append(prefix + "%")
common_where = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""
try:
pg = get_pg()
cur = pg.cursor()
# pgvector 0.6 HNSW doesn't iterate past its initial candidate list when
# a restrictive WHERE filter is present — so a filter that excludes the
# top-N nearest leaves nothing. Bumping ef_search forces the index to
# explore more graph nodes. Only set when a filter is present, where it is
# load-bearing; unfiltered queries keep the default.
if where_clauses:
cur.execute("SET LOCAL hnsw.ef_search = 500")
cur.execute(f"""
SELECT id, document, source
SELECT id, document, source, metadata->>'folder' AS folder
FROM embeddings
{where_sql}
{common_where}
ORDER BY embedding <=> %s::vector
LIMIT %s
""", (*type_param, query_embedding, HYBRID_CANDIDATES))
""", (*extra_params, query_embedding, HYBRID_CANDIDATES))
dense_hits = cur.fetchall()
lexical_hits = []
if ts_query:
lex_where = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
full_where = (f"WHERE {lex_where} AND type = ANY(%s)"
if type_filter else f"WHERE {lex_where}")
lex_params = ((ts_query, list(type_filter)) if type_filter else (ts_query,))
lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses))
cur.execute(f"""
SELECT id, document, source
SELECT id, document, source, metadata->>'folder' AS folder
FROM embeddings
{full_where}
{lex_where}
ORDER BY ts_rank(to_tsvector('english', document),
websearch_to_tsquery('english', %s)) DESC
LIMIT %s
""", (*lex_params, ts_query, HYBRID_CANDIDATES))
""", (ts_query, *extra_params, ts_query, HYBRID_CANDIDATES))
lexical_hits = cur.fetchall()
pg.close()
@@ -372,9 +410,16 @@ def retrieve_context(query, n_results=FINAL_LIMIT, type_filter=None):
rrf_ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked]
for _id, doc, source in _rerank(query, candidates)[:n_results]:
seen = set()
for _id, doc, source, folder in _rerank(query, candidates):
key = _dedup_key(doc)
if key in seen:
continue
seen.add(key)
context_pieces.append(doc)
sources.append(source or "unknown")
sources.append(_format_source(source, folder))
if len(context_pieces) >= n_results:
break
except Exception as e:
print(f"hybrid retrieval error: {e}")
@@ -418,8 +463,11 @@ def create_conversation(title="New conversation"):
def chat(user_message, conversation_id, settings, client_time=None):
memory = load_memory()
type_filter = classify_retrieval_intent(user_message)
context_pieces, sources = retrieve_context(user_message, type_filter=type_filter)
type_filter, folder_excludes = classify_retrieval_intent(user_message)
context_pieces, sources = retrieve_context(
user_message, type_filter=type_filter,
folder_exclude_prefixes=folder_excludes,
)
history = get_conversation_history(conversation_id)
context_parts = []