# aaronAI/scripts/reindex_docx_pptx.py

"""One-off: re-ingest docx+pptx after the 2026-05-04 extractor upgrade (commit 93c0d89).

Pre-upgrade extraction missed tables, headers/footers, text boxes, group shapes,
and pptx notes — leaving CVs/dossiers as section-header skeletons in the index.

Steps when run with --apply:
  1. DELETE all embeddings rows where source ends in .docx or .pptx
  2. Walk NEXTCLOUD_PATH and re-ingest every .docx/.pptx via _ingest_one
  3. Stage 2 enqueue is suppressed (SKIP_STAGE2_ENQUEUE=1)

Without --apply: dry-run. Counts files and chunks, prints a sample, writes nothing.
"""

import os
import sys
import time
from pathlib import Path

os.environ["SKIP_STAGE2_ENQUEUE"] = "1"

from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env", override=True)

import psycopg2
from sentence_transformers import SentenceTransformer

sys.path.insert(0, str(Path(__file__).parent))
from ingest import _ingest_one, get_pg

# Root directory that find_files() walks recursively; also passed to
# _ingest_one as ``root`` during the re-ingest loop.
NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
# Lower-case file suffixes (with leading dot) selected for re-ingest; compared
# against ``f.suffix.lower()`` so matching is case-insensitive.
TARGET_EXTS = {".docx", ".pptx"}

# True when the literal argument ``--apply`` appears anywhere in sys.argv.
# When False, main() prints its report and returns before deleting anything.
APPLY = "--apply" in sys.argv


def count_stale():
    """Summarise the docx/pptx rows currently stored in ``embeddings``.

    Runs one read-only aggregate query, grouped by the lower-cased file
    extension of ``source`` (including the leading dot).

    Returns:
        The rows from ``cursor.fetchall()``: one ``(ext, files, chunks)`` row
        per extension, ordered by extension, where ``files`` is the number of
        distinct ``source`` values and ``chunks`` the number of rows.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute(
            "SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
            "COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
            "FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$' "
            "GROUP BY 1 ORDER BY 1"
        )
        return cur.fetchall()
    finally:
        # Close even when the query raises so the connection is not leaked.
        pg.close()


def delete_stale():
    """Delete every ``embeddings`` row whose ``source`` ends in .docx or .pptx.

    The match is case-insensitive (``lower(source)``). The delete is committed
    before returning.

    Returns:
        The number of rows deleted, as reported by ``cursor.rowcount``.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute("DELETE FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$'")
        deleted = cur.rowcount
        pg.commit()
        return deleted
    finally:
        # Close on every path so a failed DELETE/commit does not leak the
        # connection. NOTE(review): with psycopg2, closing without commit
        # discards the pending transaction -- confirm get_pg() returns a
        # plain psycopg2 connection.
        pg.close()


def find_files(root=None, exts=None):
    """Collect the documents under *root* that should be re-ingested.

    Args:
        root: Directory to walk recursively (``str`` or ``Path``). Defaults to
            ``NEXTCLOUD_PATH``.
        exts: Iterable of suffixes, each with its leading dot, to keep. They
            are matched case-insensitively. Defaults to ``TARGET_EXTS``.

    Returns:
        A list of ``Path`` objects, in the order ``rglob`` yields them, for
        every regular file whose suffix matches and whose name starts with
        neither ``~$`` nor ``.``.
    """
    base = NEXTCLOUD_PATH if root is None else Path(root)
    wanted = TARGET_EXTS if exts is None else {e.lower() for e in exts}
    # Names starting with "~$" or "." are skipped -- presumably editor lock
    # files and hidden files, which are not real documents.
    skip_prefixes = ("~$", ".")
    return [
        f
        for f in base.rglob("*")
        if f.is_file()
        and f.suffix.lower() in wanted
        and not f.name.startswith(skip_prefixes)
    ]


def main():
    """Print the re-index plan and, when ``--apply`` was given, execute it.

    Always prints: the mode, the target tree, the extensions, the
    SKIP_STAGE2_ENQUEUE value, the stale docx/pptx counts in the DB, the
    per-extension file counts on disk, and up to five random sample paths.
    Without ``--apply`` it returns after that report and writes nothing.

    With ``--apply`` it then:
      1. aborts if no files were found on disk (nothing is deleted);
      2. loads the sentence-transformers embedder;
      3. deletes the stale rows via ``delete_stale``;
      4. re-ingests each file via ``_ingest_one``, reporting progress every
         25 files and a final summary.

    A file counts as "failed" when ``_ingest_one`` returns a value that is not
    greater than zero or raises an exception.
    """
    import random  # only used for the sample listing below

    print(f"Mode: {'APPLY (destructive)' if APPLY else 'DRY-RUN (no writes)'}")
    print(f"Target: {NEXTCLOUD_PATH}")
    print(f"Extensions: {sorted(TARGET_EXTS)}")
    print(f"SKIP_STAGE2_ENQUEUE={os.environ.get('SKIP_STAGE2_ENQUEUE')}")
    print()

    print("Stale chunks currently in DB:")
    for ext, n_files, n_chunks in count_stale():
        print(f"  {ext}: {n_files} files, {n_chunks} chunks")
    print()

    files = find_files()
    by_ext = {}
    for f in files:
        by_ext.setdefault(f.suffix.lower(), []).append(f)
    print("Files on disk to re-ingest:")
    for ext, lst in sorted(by_ext.items()):
        print(f"  {ext}: {len(lst)} files")
    print(f"  total: {len(files)}")
    print()
    print("Sample (5 random):")
    for f in random.sample(files, min(5, len(files))):
        print(f"  {f}")
    print()

    if not APPLY:
        print("Dry-run only. Re-run with --apply to delete + re-ingest.")
        return

    if not files:
        # An empty walk most likely means the target path is wrong or not
        # mounted. Deleting now would purge the rows with nothing to
        # replace them, so stop before any write.
        print(f"No {sorted(TARGET_EXTS)} files found under {NEXTCLOUD_PATH}; "
              "aborting before delete. Nothing was changed.")
        return

    # Load the model BEFORE the destructive delete: if loading fails, the
    # existing rows are still intact.
    print("Loading embedder...")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    print()

    print("Deleting stale chunks...")
    deleted = delete_stale()
    print(f"  deleted {deleted} rows")
    print()

    total = len(files)
    print(f"Re-ingesting {total} files...")
    started = time.monotonic()  # monotonic: unaffected by wall-clock changes
    ingested = failed = total_chunks = 0
    for i, f in enumerate(files, 1):
        try:
            n = _ingest_one(f, embedder, root=NEXTCLOUD_PATH)
        except Exception as exc:
            # The old rows are already gone, so one bad document must not
            # strand the remaining files un-indexed: report it and move on.
            # NOTE(review): _ingest_one's own error contract is not visible
            # from this file -- confirm it does not already handle this.
            print(f"  ! {f}: {type(exc).__name__}: {exc}")
            n = 0
        if n > 0:
            ingested += 1
            total_chunks += n
        else:
            failed += 1
        if i % 25 == 0 or i == total:
            elapsed = time.monotonic() - started
            rate = i / elapsed if elapsed else 0
            print(f"  [{i}/{total}] ingested={ingested} failed={failed} "
                  f"chunks={total_chunks} ({rate:.1f} files/s)")
    elapsed = time.monotonic() - started
    print()
    print(f"Done in {elapsed:.0f}s: {ingested} ingested, {failed} failed, "
          f"{total_chunks} chunks written.")


if __name__ == "__main__":
    main()