encoding: per-slide pptx chunking + extract_blocks API; api: recency tiebreak

extract_blocks(filepath) is the new structured-extraction entry point; it returns a list of {heading, text, kind} dicts. chunk_and_embed accepts either a str (blind chunking, kept for back-compat) or a list[dict] (one chunk per block, blind-split if oversize, with the heading prepended for retrieval context and stored in metadata).

- pptx: one block per slide. The slide title becomes the block heading; speaker notes fold into the body. Image-only decks with title-only slides now produce heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection was implemented and rolled back: hand-formatted CVs are Normal-styled with bold-as-heading, and tying chunk boundaries to formatting choices would lock future users into preserving those choices forever. Lexical + cross-encoder retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).

Recency tiebreak in retrieve_context: pull created_at into the SELECT and use it as the secondary sort key in _rerank, so that memory/journal snapshots prefer the latest copy among near-duplicate content.

reindex_docx_pptx.py now accepts --ext=pptx,docx,... so a re-ingest can target a subset of extensions; the previous hardcoded delete regex would have wiped both docx and pptx even when only one extension was targeted.
2026-05-19 21:58:25 +00:00
parent 50b97e2998
commit 9955c7e383
5 changed files with 187 additions and 69 deletions
@@ -302,14 +302,19 @@ def classify_retrieval_intent(query: str):


 def _rerank(query: str, candidates: list[tuple]) -> list[tuple]:
-    """Cross-encoder rerank. Candidates are (id, document, source, folder) tuples.
-    Returns the same tuples reordered by reranker score (highest first)."""
+    """Cross-encoder rerank. Candidates are (id, document, source, folder, created_at)
+    tuples. Returns the same tuples reordered by reranker score with created_at as
+    secondary key — so when two chunks score similarly the newer one wins, which
+    keeps memory/journal files biased toward the latest snapshot."""
    if not candidates:
        return []
    pairs = [(query, row[1]) for row in candidates]
    scores = reranker.predict(pairs)
-    return [row for row, _ in sorted(zip(candidates, scores),
-                                     key=lambda x: x[1], reverse=True)]
+    return [row for row, _ in sorted(
+        zip(candidates, scores),
+        key=lambda x: (float(x[1]), x[0][4] or ""),
+        reverse=True,
+    )]


 def _format_source(source: str, folder: str) -> str:
@@ -374,7 +379,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
            cur.execute("SET LOCAL hnsw.ef_search = 500")

        cur.execute(f"""
-            SELECT id, document, source, metadata->>'folder' AS folder
+            SELECT id, document, source, metadata->>'folder' AS folder, created_at
            FROM embeddings
            {common_where}
            ORDER BY embedding <=> %s::vector
@@ -387,7 +392,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
            lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)"
            lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses))
            cur.execute(f"""
-                SELECT id, document, source, metadata->>'folder' AS folder
+                SELECT id, document, source, metadata->>'folder' AS folder, created_at
                FROM embeddings
                {lex_where}
                ORDER BY ts_rank(to_tsvector('english', document),
@@ -411,7 +416,7 @@ def retrieve_context(query, n_results=FINAL_LIMIT,
        candidates = [rows_by_id[doc_id] for doc_id, _ in rrf_ranked]

        seen = set()
-        for _id, doc, source, folder in _rerank(query, candidates):
+        for _id, doc, source, folder, _created_at in _rerank(query, candidates):
            key = _dedup_key(doc)
            if key in seen:
                continue