encoding: per-slide pptx chunking + extract_blocks API; api: recency tiebreak
extract_blocks(filepath) is the new structured-extraction entry point, returning
list[{heading, text, kind}]. chunk_and_embed accepts either str (blind-chunk
back-compat) or list[dict] (one chunk per block, blind-split if oversize, heading
prepended for retrieval context and stored in metadata).
- pptx: one block per slide. Slide title becomes block heading; speaker notes
fold into the body. Image-only decks with title-only slides now produce
heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection
was implemented and rolled back: hand-formatted CVs are Normal-styled with
bold-as-heading, and tying chunk boundaries to formatting choices would lock
future-user into preserving those choices forever. Lexical + cross-encoder
retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).
Recency tiebreak in retrieve_context: pull created_at into the SELECT, use it
as secondary sort key in _rerank so memory/journal snapshots prefer the latest
copy among near-duplicate content.
reindex_docx_pptx.py now accepts --ext=pptx,docx... so re-ingest can target a
subset; previous hardcoded delete regex would have wiped both even with a
single-ext target.
This commit is contained in:
@@ -12,6 +12,7 @@ Without --apply: dry-run. Counts files and chunks, prints a sample, writes nothi
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -28,19 +29,29 @@ sys.path.insert(0, str(Path(__file__).parent))
|
||||
from ingest import _ingest_one, get_pg
|
||||
|
||||
NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
|
||||
TARGET_EXTS = {".docx", ".pptx"}
|
||||
|
||||
APPLY = "--apply" in sys.argv
|
||||
# Optional --ext=pptx,docx flags (repeatable) narrow the run to a subset of
# extensions; without any, both .docx and .pptx are targeted.
_ext_args = [a for a in sys.argv[1:] if a.startswith("--ext=")]
if _ext_args:
    # Trim each comma-separated token and normalise it to a single leading dot.
    # Empty tokens (from "--ext=", "--ext=pptx," or "a,,b") are dropped: kept,
    # they would become the bare extension ".", which turns into an empty
    # alternative in the delete regex and matches any source ending in a dot.
    TARGET_EXTS = {
        "." + name
        for arg in _ext_args
        for name in (
            piece.strip().lstrip(".")
            for piece in arg.split("=", 1)[1].split(",")
        )
        if name
    }
    if not TARGET_EXTS:
        # Refuse to continue with an empty target set rather than guess.
        sys.exit("--ext= given without any extension (expected e.g. --ext=pptx,docx)")
else:
    TARGET_EXTS = {".docx", ".pptx"}
|
||||
|
||||
|
||||
def _ext_regex():
    """Return a regex source string matching names that end in a TARGET_EXTS extension.

    The result has the form ``\\.(ext1|ext2)$`` with the alternatives sorted,
    so the same TARGET_EXTS always yields the same string.

    Callers in this file compare the pattern against ``lower(source)`` and
    splice it into a single-quoted SQL literal, so extensions are:

    * lower-cased (an upper-case ``--ext=PPTX`` would otherwise never match),
    * de-duplicated, and stripped of leading dots / surrounding whitespace,
    * restricted to ``[0-9a-z._+-]`` so no quote or backslash can reach the
      SQL text (``re.escape`` does not escape ``'``).

    Raises:
        ValueError: if no usable extension remains, or one contains a
            character outside the allowed set. An empty alternative would
            match every source ending in a dot, which is never intended.
    """
    exts = sorted({e.lstrip(".").strip().lower() for e in TARGET_EXTS} - {""})
    if not exts:
        raise ValueError("TARGET_EXTS contains no usable extension")
    for ext in exts:
        if not re.fullmatch(r"[0-9a-z._+-]+", ext):
            raise ValueError(f"unsupported characters in extension: {ext!r}")
    inner = "|".join(re.escape(ext) for ext in exts)
    return f"\\.({inner})$"
|
||||
|
||||
|
||||
def count_stale():
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
cur.execute(
|
||||
"SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
|
||||
"COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
|
||||
"FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$' "
|
||||
"GROUP BY 1 ORDER BY 1"
|
||||
f"SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
|
||||
f"COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
|
||||
f"FROM embeddings WHERE lower(source) ~ '{_ext_regex()}' "
|
||||
f"GROUP BY 1 ORDER BY 1"
|
||||
)
|
||||
rows = cur.fetchall()
|
||||
pg.close()
|
||||
@@ -50,7 +61,7 @@ def count_stale():
|
||||
def delete_stale():
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
cur.execute("DELETE FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$'")
|
||||
cur.execute(f"DELETE FROM embeddings WHERE lower(source) ~ '{_ext_regex()}'")
|
||||
deleted = cur.rowcount
|
||||
pg.commit()
|
||||
pg.close()
|
||||
|
||||
Reference in New Issue
Block a user