9955c7e383
extract_blocks(filepath) is the new structured-extraction entry point, returning
list[{heading, text, kind}]. chunk_and_embed accepts either str (blind-chunk
back-compat) or list[dict] (one chunk per block, blind-split if oversize, heading
prepended for retrieval context and stored in metadata).
- pptx: one block per slide. Slide title becomes block heading; speaker notes
fold into the body. Image-only decks with title-only slides now produce
heading-only chunks instead of being recorded as extraction failures.
- docx: deliberately single-block (back-compat). Heading-style section detection
was implemented and rolled back: hand-formatted CVs are Normal-styled with
bold-as-heading, and tying chunk boundaries to formatting choices would lock
future users into preserving those choices forever. Lexical + cross-encoder
retrieval already handles substring matching inside blind-chunked CVs.
- pdf/txt/md: unchanged (single block, blind chunking).
Recency tiebreak in retrieve_context: pull created_at into the SELECT, use it
as secondary sort key in _rerank so memory/journal snapshots prefer the latest
copy among near-duplicate content.
reindex_docx_pptx.py now accepts --ext=pptx,docx... so re-ingest can target a
subset; previous hardcoded delete regex would have wiped both even with a
single-ext target.
147 lines
4.2 KiB
Python
147 lines
4.2 KiB
Python
"""One-off: re-ingest docx+pptx after the 2026-05-04 extractor upgrade (commit 93c0d89).
|
|
|
|
Pre-upgrade extraction missed tables, headers/footers, text boxes, group shapes,
|
|
and pptx notes — leaving CVs/dossiers as section-header skeletons in the index.
|
|
|
|
Steps when run with --apply:
|
|
1. DELETE all embeddings rows whose source ends in a targeted extension
(.docx and .pptx by default; narrow with --ext=pptx or --ext=pptx,docx)
2. Walk NEXTCLOUD_PATH and re-ingest every file with a targeted extension via _ingest_one
|
|
3. Stage 2 enqueue is suppressed (SKIP_STAGE2_ENQUEUE=1)
|
|
|
|
Without --apply: dry-run. Counts files and chunks, prints a sample, writes nothing.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
os.environ["SKIP_STAGE2_ENQUEUE"] = "1"
|
|
|
|
from dotenv import load_dotenv
|
|
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
|
|
|
import psycopg2
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
from ingest import _ingest_one, get_pg
|
|
|
|
NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")


def parse_target_exts(argv):
    """Return the set of extensions selected by ``--ext=`` flags in *argv*.

    Each flag carries a comma-separated list (``--ext=pptx,docx``); several
    flags are merged. Entries are trimmed, lower-cased and given exactly one
    leading dot, because both the on-disk filter (``f.suffix.lower()``) and
    the SQL filter (``lower(source) ~ ...``) compare against lower-case
    values, so an upper-case entry could never match anything. Empty entries
    (``--ext=`` or a trailing comma) are ignored.

    Args:
        argv: Iterable of command-line argument strings; arguments that do
            not start with ``--ext=`` are ignored.

    Returns:
        A set of dotted lower-case extensions. ``{".docx", ".pptx"}`` when
        no ``--ext=`` flag is present.

    Raises:
        ValueError: if ``--ext=`` flags are present but name no extension.
    """
    flags = [arg for arg in argv if arg.startswith("--ext=")]
    if not flags:
        return {".docx", ".pptx"}

    selected = set()
    for flag in flags:
        for raw in flag.split("=", 1)[1].split(","):
            name = raw.strip().lower().lstrip(".")
            if name:
                selected.add("." + name)

    if not selected:
        # An empty selection would make the delete regex degenerate; refuse
        # rather than guess what the operator meant.
        raise ValueError("--ext= was given but names no extension")
    return selected


# True when the run is allowed to delete and rewrite index rows.
APPLY = "--apply" in sys.argv

_ext_args = [a for a in sys.argv[1:] if a.startswith("--ext=")]
try:
    TARGET_EXTS = parse_target_exts(_ext_args)
except ValueError as exc:
    sys.exit(f"error: {exc}")
|
|
|
|
|
|
def _ext_regex():
    """Build the regex matching a source that ends in any targeted extension.

    The bare extension names from TARGET_EXTS (leading dots removed) are
    sorted, escaped with ``re.escape`` and joined into one alternation that
    sits between a literal dot and an end-of-string anchor.
    """
    alternatives = []
    for ext in sorted(TARGET_EXTS):
        alternatives.append(re.escape(ext.lstrip(".")))
    return "\\.(" + "|".join(alternatives) + ")$"
|
|
|
|
|
|
def count_stale():
    """Count indexed files and chunks per targeted extension.

    Returns:
        The rows fetched from the cursor: one ``(ext, files, chunks)`` row
        per extension found among sources matching ``_ext_regex()``,
        ordered by extension.
    """
    # NOTE(review): assumes get_pg() returns a psycopg2 connection (psycopg2
    # is what this file imports), hence the %s placeholder — confirm.
    pg = get_pg()
    try:
        cur = pg.cursor()
        # The regex is bound as a parameter instead of being spliced into
        # the SQL text, so quoting inside an extension cannot alter the query.
        cur.execute(
            "SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
            "COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
            "FROM embeddings WHERE lower(source) ~ %s "
            "GROUP BY 1 ORDER BY 1",
            (_ext_regex(),),
        )
        return cur.fetchall()
    finally:
        # Close even when the query fails so the connection is not leaked.
        pg.close()
|
|
|
|
|
|
def delete_stale():
    """Delete every embeddings row whose source ends in a targeted extension.

    Returns:
        The cursor's ``rowcount`` after the DELETE, i.e. the number of rows
        removed.
    """
    # NOTE(review): assumes get_pg() returns a psycopg2 connection (psycopg2
    # is what this file imports), hence the %s placeholder — confirm.
    pg = get_pg()
    try:
        cur = pg.cursor()
        # Bind the regex as a parameter rather than formatting it into the SQL.
        cur.execute(
            "DELETE FROM embeddings WHERE lower(source) ~ %s",
            (_ext_regex(),),
        )
        deleted = cur.rowcount
        pg.commit()
        return deleted
    finally:
        # Runs on failure too; closing without a commit leaves the delete
        # uncommitted.
        pg.close()
|
|
|
|
|
|
def find_files(root=None, exts=None):
    """Collect the files under *root* that should be re-ingested.

    A path qualifies when its lower-cased suffix is in *exts*, its name does
    not start with ``~$`` or ``.``, and it is a regular file.

    Args:
        root: Directory to walk recursively. Defaults to NEXTCLOUD_PATH.
        exts: Collection of dotted, lower-case extensions to keep.
            Defaults to TARGET_EXTS.

    Returns:
        A list of matching ``Path`` objects in the order ``rglob`` yields them.
    """
    base = NEXTCLOUD_PATH if root is None else Path(root)
    wanted = TARGET_EXTS if exts is None else exts
    # The name-based checks run first: they need no filesystem access, so
    # is_file() is only called for the few candidates that survive them.
    return [
        f
        for f in base.rglob("*")
        if f.suffix.lower() in wanted
        and not f.name.startswith(("~$", "."))
        and f.is_file()
    ]
|
|
|
|
|
|
def main():
    """Report what would be re-ingested and, with --apply, do it.

    Always prints the mode, the target path and extensions, the per-extension
    counts currently in the DB, the per-extension counts found on disk and a
    random sample of up to five paths. Without --apply it stops there.

    With --apply it loads the embedder, deletes the stale rows, then feeds
    every discovered file to ``_ingest_one``, printing progress every 25
    files and a final summary. A file counts as ingested when ``_ingest_one``
    returns a positive chunk count and as failed otherwise.

    Exits with an error message instead of deleting anything when --apply is
    given but no matching file exists on disk.
    """
    import random

    print(f"Mode: {'APPLY (destructive)' if APPLY else 'DRY-RUN (no writes)'}")
    print(f"Target: {NEXTCLOUD_PATH}")
    print(f"Extensions: {sorted(TARGET_EXTS)}")
    print(f"SKIP_STAGE2_ENQUEUE={os.environ.get('SKIP_STAGE2_ENQUEUE')}")
    print()

    print("Stale chunks currently in DB:")
    for ext, n_files, n_chunks in count_stale():
        print(f" {ext}: {n_files} files, {n_chunks} chunks")
    print()

    files = find_files()
    by_ext = {}
    for f in files:
        by_ext.setdefault(f.suffix.lower(), []).append(f)
    print("Files on disk to re-ingest:")
    for ext, lst in sorted(by_ext.items()):
        print(f" {ext}: {len(lst)} files")
    print(f" total: {len(files)}")
    print()
    print("Sample (5 random):")
    for f in random.sample(files, min(5, len(files))):
        print(f" {f}")
    print()

    if not APPLY:
        print("Dry-run only. Re-run with --apply to delete + re-ingest.")
        return

    if not files:
        # Nothing on disk (wrong path, unmounted share, typo in --ext=...):
        # deleting now would empty the index with nothing to put back.
        sys.exit(
            f"No matching files under {NEXTCLOUD_PATH}; "
            "refusing to delete indexed chunks."
        )

    # Load the model before the destructive step so a load failure cannot
    # leave the index deleted but not rebuilt.
    print("Loading embedder...")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    print()

    print("Deleting stale chunks...")
    deleted = delete_stale()
    print(f" deleted {deleted} rows")
    print()

    print(f"Re-ingesting {len(files)} files...")
    started = time.monotonic()
    ingested = failed = total_chunks = 0
    for i, f in enumerate(files, 1):
        written = _ingest_one(f, embedder, root=NEXTCLOUD_PATH)
        if written > 0:
            ingested += 1
            total_chunks += written
        else:
            failed += 1
        if i % 25 == 0 or i == len(files):
            elapsed = time.monotonic() - started
            rate = i / elapsed if elapsed else 0
            print(f" [{i}/{len(files)}] ingested={ingested} failed={failed} "
                  f"chunks={total_chunks} ({rate:.1f} files/s)")
    elapsed = time.monotonic() - started
    print()
    print(f"Done in {elapsed:.0f}s: {ingested} ingested, {failed} failed, "
          f"{total_chunks} chunks written.")


if __name__ == "__main__":
    main()
|