api.py: folder-aware retrieval, near-duplicate dedup, folder in citations

Three refinements to retrieve_context, all keyed off observed failures from
test_retrieval.py:

- Library/personal split. classify_retrieval_intent now returns
  (type_filter, folder_exclude_prefixes). Biographical document intent excludes
  Library/* so philosophy/cognition books stop crowding out CVs and dossiers
  for queries like "write me a bio".

- Near-duplicate collapse. Multi-folder copies of the same file (e.g., several
  Teaching Philosophy.pdf in different application folders) used to fill the
  top-N with the same content. Dedup by first-300-chars hash after rerank.

- Folder in source citations. Surface metadata.folder alongside basename so
  the LLM can disambiguate among 21 CV.docx variants and the user can see
  which copy a citation refers to.

Also: bump hnsw.ef_search to 500 when a WHERE filter is present.
pgvector 0.6 doesn't iterate past its initial HNSW candidate list, so a
restrictive filter that excludes the nearest neighbors otherwise returns
empty.
This commit is contained in:
2026-05-19 21:35:28 +00:00
parent 8d560f9f5e
commit 50b97e2998
2 changed files with 83 additions and 33 deletions
+78 -30
View File
@@ -277,25 +277,32 @@ def _websearch_query(text: str) -> str:
def classify_retrieval_intent(query: str): def classify_retrieval_intent(query: str):
"""Return a list of `type` values to filter retrieval on, or None for all types. """Return (type_filter, folder_exclude_prefixes). Either may be None.
type_filter restricts the candidate pool by `type`; folder_exclude_prefixes
excludes any chunk whose metadata.folder matches a LIKE 'prefix%' pattern.
Implementation is a low-effort keyword classifier — explicitly tunable and Implementation is a low-effort keyword classifier — explicitly tunable and
swappable. For more nuanced routing, replace this with an LLM classifier call swappable. For nuanced routing, replace with an LLM classifier returning
that returns the same shape: a list of valid type strings or None. the same shape.
Precedence: conversation signals win over document signals — a question like Precedence: conversation signals win over document signals — "what did I
"what did I tell you about my CV" is asking about the conversation, not the CV. tell you about my CV" is asking about the conversation, not the CV.
"""
For biographical/document intent, also exclude the reference library
(Library/Foundations/* — philosophy and cognition books), which is
categorically different from personal artifacts but lives in the same
`type='document'` bucket."""
q = query.lower() q = query.lower()
if any(s in q for s in _CONVO_SIGNALS): if any(s in q for s in _CONVO_SIGNALS):
return CONVERSATION_TYPES return (CONVERSATION_TYPES, None)
if any(s in q for s in _DOC_SIGNALS): if any(s in q for s in _DOC_SIGNALS):
return DOCUMENT_TYPES return (DOCUMENT_TYPES, ["Library/"])
return None return (None, None)
def _rerank(query: str, candidates: list[tuple]) -> list[tuple]: def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
"""Cross-encoder rerank. Candidates are (id, document, source) tuples. """Cross-encoder rerank. Candidates are (id, document, source, folder) tuples.
Returns the same tuples reordered by reranker score (highest first).""" Returns the same tuples reordered by reranker score (highest first)."""
if not candidates: if not candidates:
return [] return []
@@ -305,7 +312,25 @@ def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
key=lambda x: x[1], reverse=True)] key=lambda x: x[1], reverse=True)]
def retrieve_context(query, n_results=FINAL_LIMIT, type_filter=None): def _format_source(source: str, folder: str) -> str:
"""Surface folder context to the LLM so it can disambiguate same-named files
(e.g., 21 different CV.docx files across job-application folders)."""
source = source or "unknown"
if folder and folder not in ("", "."):
return f"{folder}/{source}"
return source
def _dedup_key(doc: str) -> str:
"""Collapse near-duplicates by content. Files copied to multiple folders
produce byte-identical chunks; this catches those without affecting
legitimately-different chunks of the same source (e.g., separate sections
of a conversation)."""
return hashlib.md5(doc[:300].lower().encode("utf-8", "ignore")).hexdigest()
def retrieve_context(query, n_results=FINAL_LIMIT,
type_filter=None, folder_exclude_prefixes=None):
"""Hybrid retrieval (dense + lexical, RRF fused) followed by cross-encoder rerank. """Hybrid retrieval (dense + lexical, RRF fused) followed by cross-encoder rerank.
- Dense (pgvector) handles paraphrase / semantic similarity. - Dense (pgvector) handles paraphrase / semantic similarity.
@@ -314,48 +339,61 @@ def retrieve_context(query, n_results=FINAL_LIMIT, type_filter=None):
- RRF combines the two rankings without calibrating score scales. - RRF combines the two rankings without calibrating score scales.
- Cross-encoder rerank scores each (query, chunk) pair jointly, bridging - Cross-encoder rerank scores each (query, chunk) pair jointly, bridging
semantic gaps that bi-encoders can't (e.g., "write me a bio" -> CV chunk). semantic gaps that bi-encoders can't (e.g., "write me a bio" -> CV chunk).
- Near-duplicate collapse on output so top-N slots aren't burned by
multi-folder copies of the same file.
type_filter: optional list of `type` values to restrict the candidate pool to. type_filter: optional list of `type` values to restrict the candidate pool to.
If None, retrieves from all types. Use classify_retrieval_intent() to derive.""" folder_exclude_prefixes: optional list of folder LIKE prefixes to exclude.
Both default to None (no restriction). Use classify_retrieval_intent() to derive."""
query_embedding = embedder.encode([query]).tolist()[0] query_embedding = embedder.encode([query]).tolist()[0]
ts_query = _websearch_query(query) ts_query = _websearch_query(query)
context_pieces = [] context_pieces = []
sources = [] sources = []
where_sql = "" where_clauses = []
type_param = () extra_params = []
if type_filter: if type_filter:
where_sql = "WHERE type = ANY(%s)" where_clauses.append("type = ANY(%s)")
type_param = (list(type_filter),) extra_params.append(list(type_filter))
for prefix in (folder_exclude_prefixes or []):
where_clauses.append("(metadata->>'folder' IS NULL OR metadata->>'folder' NOT LIKE %s)")
extra_params.append(prefix + "%")
common_where = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else ""
try: try:
pg = get_pg() pg = get_pg()
cur = pg.cursor() cur = pg.cursor()
# pgvector 0.6 HNSW doesn't iterate past its initial candidate list when
# a restrictive WHERE filter is present — so a filter that excludes the
# top-N nearest leaves nothing. Bumping ef_search forces the index to
# explore more graph nodes. Cheap when unfiltered; load-bearing when filtered.
if where_clauses:
cur.execute("SET LOCAL hnsw.ef_search = 500")
cur.execute(f""" cur.execute(f"""
SELECT id, document, source SELECT id, document, source, metadata->>'folder' AS folder
FROM embeddings FROM embeddings
{where_sql} {common_where}
ORDER BY embedding <=> %s::vector ORDER BY embedding <=> %s::vector
LIMIT %s LIMIT %s
""", (*type_param, query_embedding, HYBRID_CANDIDATES)) """, (*extra_params, query_embedding, HYBRID_CANDIDATES))
dense_hits = cur.fetchall() dense_hits = cur.fetchall()
lexical_hits = [] lexical_hits = []
if ts_query: if ts_query:
lex_where = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)" lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
full_where = (f"WHERE {lex_where} AND type = ANY(%s)" lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses))
if type_filter else f"WHERE {lex_where}")
lex_params = ((ts_query, list(type_filter)) if type_filter else (ts_query,))
cur.execute(f""" cur.execute(f"""
SELECT id, document, source SELECT id, document, source, metadata->>'folder' AS folder
FROM embeddings FROM embeddings
{full_where} {lex_where}
ORDER BY ts_rank(to_tsvector('english', document), ORDER BY ts_rank(to_tsvector('english', document),
websearch_to_tsquery('english', %s)) DESC websearch_to_tsquery('english', %s)) DESC
LIMIT %s LIMIT %s
""", (*lex_params, ts_query, HYBRID_CANDIDATES)) """, (ts_query, *extra_params, ts_query, HYBRID_CANDIDATES))
lexical_hits = cur.fetchall() lexical_hits = cur.fetchall()
pg.close() pg.close()
@@ -372,9 +410,16 @@ def retrieve_context(query, n_results=FINAL_LIMIT, type_filter=None):
rrf_ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True) rrf_ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked] candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked]
for _id, doc, source in _rerank(query, candidates)[:n_results]: seen = set()
for _id, doc, source, folder in _rerank(query, candidates):
key = _dedup_key(doc)
if key in seen:
continue
seen.add(key)
context_pieces.append(doc) context_pieces.append(doc)
sources.append(source or "unknown") sources.append(_format_source(source, folder))
if len(context_pieces) >= n_results:
break
except Exception as e: except Exception as e:
print(f"hybrid retrieval error: {e}") print(f"hybrid retrieval error: {e}")
@@ -418,8 +463,11 @@ def create_conversation(title="New conversation"):
def chat(user_message, conversation_id, settings, client_time=None): def chat(user_message, conversation_id, settings, client_time=None):
memory = load_memory() memory = load_memory()
type_filter = classify_retrieval_intent(user_message) type_filter, folder_excludes = classify_retrieval_intent(user_message)
context_pieces, sources = retrieve_context(user_message, type_filter=type_filter) context_pieces, sources = retrieve_context(
user_message, type_filter=type_filter,
folder_exclude_prefixes=folder_excludes,
)
history = get_conversation_history(conversation_id) history = get_conversation_history(conversation_id)
context_parts = [] context_parts = []
+5 -3
View File
@@ -50,9 +50,11 @@ QUERIES = [
] ]
for q in QUERIES: for q in QUERIES:
intent = classify_retrieval_intent(q) type_filter, folder_excludes = classify_retrieval_intent(q)
pieces, sources = retrieve_context(q, type_filter=intent) pieces, sources = retrieve_context(
q, type_filter=type_filter, folder_exclude_prefixes=folder_excludes,
)
print(f"\n=== {q!r} ===") print(f"\n=== {q!r} ===")
print(f" intent: {intent}") print(f" type_filter: {type_filter} folder_excludes: {folder_excludes}")
for i, src in enumerate(sources, 1): for i, src in enumerate(sources, 1):
print(f" {i}. {src}") print(f" {i}. {src}")