"""One-off: re-ingest docx+pptx after the 2026-05-04 extractor upgrade (commit 93c0d89). Pre-upgrade extraction missed tables, headers/footers, text boxes, group shapes, and pptx notes — leaving CVs/dossiers as section-header skeletons in the index. Steps when run with --apply: 1. DELETE all embeddings rows where source ends in .docx or .pptx 2. Walk NEXTCLOUD_PATH and re-ingest every .docx/.pptx via _ingest_one 3. Stage 2 enqueue is suppressed (SKIP_STAGE2_ENQUEUE=1) Without --apply: dry-run. Counts files and chunks, prints a sample, writes nothing. """ import os import sys import time from pathlib import Path os.environ["SKIP_STAGE2_ENQUEUE"] = "1" from dotenv import load_dotenv load_dotenv(Path.home() / "aaronai" / ".env", override=True) import psycopg2 from sentence_transformers import SentenceTransformer sys.path.insert(0, str(Path(__file__).parent)) from ingest import _ingest_one, get_pg NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files") TARGET_EXTS = {".docx", ".pptx"} APPLY = "--apply" in sys.argv def count_stale(): pg = get_pg() cur = pg.cursor() cur.execute( "SELECT lower(substring(source from '\\.[^.]+$')) AS ext, " "COUNT(DISTINCT source) AS files, COUNT(*) AS chunks " "FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$' " "GROUP BY 1 ORDER BY 1" ) rows = cur.fetchall() pg.close() return rows def delete_stale(): pg = get_pg() cur = pg.cursor() cur.execute("DELETE FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$'") deleted = cur.rowcount pg.commit() pg.close() return deleted def find_files(): files = [] for f in NEXTCLOUD_PATH.rglob("*"): if not f.is_file(): continue if f.suffix.lower() not in TARGET_EXTS: continue if f.name.startswith(("~$", ".")): continue files.append(f) return files def main(): print(f"Mode: {'APPLY (destructive)' if APPLY else 'DRY-RUN (no writes)'}") print(f"Target: {NEXTCLOUD_PATH}") print(f"Extensions: {sorted(TARGET_EXTS)}") print(f"SKIP_STAGE2_ENQUEUE={os.environ.get('SKIP_STAGE2_ENQUEUE')}") print() print("Stale chunks currently in DB:") for ext, files, chunks in count_stale(): print(f" {ext}: {files} files, {chunks} chunks") print() files = find_files() by_ext = {} for f in files: by_ext.setdefault(f.suffix.lower(), []).append(f) print(f"Files on disk to re-ingest:") for ext, lst in sorted(by_ext.items()): print(f" {ext}: {len(lst)} files") print(f" total: {len(files)}") print() print("Sample (5 random):") import random for f in random.sample(files, min(5, len(files))): print(f" {f}") print() if not APPLY: print("Dry-run only. Re-run with --apply to delete + re-ingest.") return print("Deleting stale chunks...") n = delete_stale() print(f" deleted {n} rows") print() print("Loading embedder...") embedder = SentenceTransformer("all-MiniLM-L6-v2") print() print(f"Re-ingesting {len(files)} files...") started = time.time() ingested = failed = total_chunks = 0 for i, f in enumerate(files, 1): n = _ingest_one(f, embedder, root=NEXTCLOUD_PATH) if n > 0: ingested += 1 total_chunks += n else: failed += 1 if i % 25 == 0 or i == len(files): elapsed = time.time() - started rate = i / elapsed if elapsed else 0 print(f" [{i}/{len(files)}] ingested={ingested} failed={failed} " f"chunks={total_chunks} ({rate:.1f} files/s)") elapsed = time.time() - started print() print(f"Done in {elapsed:.0f}s: {ingested} ingested, {failed} failed, " f"{total_chunks} chunks written.") if __name__ == "__main__": main()