encoding: per-slide pptx chunking + extract_blocks API; api: recency tiebreak
extract_blocks(filepath) is the new structured-extraction entry point, returning
list[{heading, text, kind}]. chunk_and_embed accepts either str (blind-chunk
back-compat) or list[dict] (one chunk per block, blind-split if oversize, heading
prepended for retrieval context and stored in metadata).
- pptx: one block per slide. Slide title becomes block heading; speaker notes
fold into the body. Image-only decks with title-only slides now produce
heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection
was implemented and rolled back: hand-formatted CVs are Normal-styled with
bold-as-heading, and tying chunk boundaries to formatting choices would lock
future users into preserving those choices forever. Lexical + cross-encoder
retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).
Recency tiebreak in retrieve_context: pull created_at into the SELECT and use it
as a secondary sort key in _rerank so that, among near-duplicate content,
memory/journal snapshots prefer the latest copy.
reindex_docx_pptx.py now accepts --ext=pptx,docx... so a re-ingest can target a
subset; the previous hardcoded delete regex would have wiped both extensions
even with a single-ext target.
This commit is contained in:
+12
-5
@@ -29,7 +29,7 @@ from sentence_transformers import SentenceTransformer
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
|
||||
from encoding import extract_text, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
||||
from encoding import extract_blocks, chunk_and_embed, write_embeddings_batch, SUPPORTED
|
||||
from failures import (
|
||||
record_ingest_failure as _record_failure_sql,
|
||||
resolve_ingest_failure as _resolve_failure_sql,
|
||||
@@ -128,8 +128,11 @@ def ingest_file(filepath: Path, embedder) -> int:
|
||||
return 0
|
||||
if filepath.suffix.lower() not in SUPPORTED:
|
||||
return 0
|
||||
text = extract_text(filepath)
|
||||
if not text.strip():
|
||||
blocks = extract_blocks(filepath)
|
||||
if not blocks or not any(
|
||||
(b.get("text") or "").strip() or (b.get("heading") or "").strip()
|
||||
for b in blocks
|
||||
):
|
||||
record_ingest_failure(filepath, "Text extraction failed or empty")
|
||||
return 0
|
||||
folder_rel = None
|
||||
@@ -138,7 +141,7 @@ def ingest_file(filepath: Path, embedder) -> int:
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
rows = chunk_and_embed(text, filepath.name, embedder,
|
||||
rows = chunk_and_embed(blocks, filepath.name, embedder,
|
||||
filepath=filepath, folder=folder_rel)
|
||||
except Exception as e:
|
||||
log.error(f"Embedding failed for {filepath.name}: {e}")
|
||||
@@ -159,7 +162,11 @@ def ingest_file(filepath: Path, embedder) -> int:
|
||||
return 0
|
||||
log.info(f"Indexed {len(rows)} chunks: {filepath.name}")
|
||||
resolve_ingest_failure(source)
|
||||
enqueue_stage2(source, text)
|
||||
full_text = "\n".join(
|
||||
f"{b['heading']}\n{b['text']}" if b.get("heading") else b.get("text", "")
|
||||
for b in blocks
|
||||
)
|
||||
enqueue_stage2(source, full_text)
|
||||
return len(rows)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user