encoding: per-slide pptx chunking + extract_blocks API; api: recency tiebreak
extract_blocks(filepath) is the new structured-extraction entry point, returning
list[{heading, text, kind}]. chunk_and_embed accepts either str (blind-chunk
back-compat) or list[dict] (one chunk per block, blind-split if oversize, heading
prepended for retrieval context and stored in metadata).
- pptx: one block per slide. Slide title becomes block heading; speaker notes
fold into the body. Image-only decks with title-only slides now produce
heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection
was implemented and rolled back: hand-formatted CVs are Normal-styled with
bold-as-heading, and tying chunk boundaries to formatting choices would lock
future users into preserving those choices forever. Lexical + cross-encoder
retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).
Recency tiebreak in retrieve_context: pull created_at into the SELECT, use it
as secondary sort key in _rerank so memory/journal snapshots prefer the latest
copy among near-duplicate content.
reindex_docx_pptx.py now accepts --ext=pptx,docx,... so a re-ingest can target a
subset of extensions; the previous hardcoded delete regex would have wiped both
even with a single-extension target.
This commit is contained in:
+12
-7
@@ -302,14 +302,19 @@ def classify_retrieval_intent(query: str):
|
||||
|
||||
|
||||
def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
|
||||
"""Cross-encoder rerank. Candidates are (id, document, source, folder) tuples.
|
||||
Returns the same tuples reordered by reranker score (highest first)."""
|
||||
"""Cross-encoder rerank. Candidates are (id, document, source, folder, created_at)
|
||||
tuples. Returns the same tuples reordered by reranker score with created_at as
|
||||
secondary key — so when two chunks score similarly the newer one wins, which
|
||||
keeps memory/journal files biased toward the latest snapshot."""
|
||||
if not candidates:
|
||||
return []
|
||||
pairs = [(query, row[1]) for row in candidates]
|
||||
scores = reranker.predict(pairs)
|
||||
return [row for row, _ in sorted(zip(candidates, scores),
|
||||
key=lambda x: x[1], reverse=True)]
|
||||
return [row for row, _ in sorted(
|
||||
zip(candidates, scores),
|
||||
key=lambda x: (float(x[1]), x[0][4] or ""),
|
||||
reverse=True,
|
||||
)]
|
||||
|
||||
|
||||
def _format_source(source: str, folder: str) -> str:
|
||||
@@ -374,7 +379,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
|
||||
cur.execute("SET LOCAL hnsw.ef_search = 500")
|
||||
|
||||
cur.execute(f"""
|
||||
SELECT id, document, source, metadata->>'folder' AS folder
|
||||
SELECT id, document, source, metadata->>'folder' AS folder, created_at
|
||||
FROM embeddings
|
||||
{common_where}
|
||||
ORDER BY embedding <=> %s::vector
|
||||
@@ -387,7 +392,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
|
||||
lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
|
||||
lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses))
|
||||
cur.execute(f"""
|
||||
SELECT id, document, source, metadata->>'folder' AS folder
|
||||
SELECT id, document, source, metadata->>'folder' AS folder, created_at
|
||||
FROM embeddings
|
||||
{lex_where}
|
||||
ORDER BY ts_rank(to_tsvector('english', document),
|
||||
@@ -411,7 +416,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
|
||||
candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked]
|
||||
|
||||
seen = set()
|
||||
for _id, doc, source, folder in _rerank(query, candidates):
|
||||
for _id, doc, source, folder, _created_at in _rerank(query, candidates):
|
||||
key = _dedup_key(doc)
|
||||
if key in seen:
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user