api.py: hybrid retrieval with intent routing and cross-encoder rerank

Replaces pure-dense top-8 retrieval with a three-stage pipeline:

- BM25 (tsvector + websearch_to_tsquery) and dense (pgvector) in parallel, fused with Reciprocal Rank Fusion
- Optional type filter driven by classify_retrieval_intent() so questions about prior conversations don't pull documents and vice versa
- Cross-encoder rerank (ms-marco-MiniLM-L-6-v2) over RRF candidates before taking final top-N

Also adds scripts/reindex_docx_pptx.py — a one-off re-ingest used to recover table/header/text-box content in docx and pptx after the 93c0d89 extractor upgrade — and scripts/test_retrieval.py to exercise the new pipeline against representative queries.

Schema: requires a GIN index on to_tsvector('english', document) (already created out-of-band via psql, since Apache AGE in shared_preload_libraries blocks ALTER TABLE on this database).
2026-05-19 21:11:15 +00:00
parent 732e450d21
commit 8d560f9f5e
4 changed files with 322 additions and 15 deletions
@@ -0,0 +1,135 @@
+"""One-off: re-ingest docx+pptx after the 2026-05-04 extractor upgrade (commit 93c0d89).
+
+Pre-upgrade extraction missed tables, headers/footers, text boxes, group shapes,
+and pptx notes — leaving CVs/dossiers as section-header skeletons in the index.
+
+Steps when run with --apply:
+  1. DELETE all embeddings rows where source ends in .docx or .pptx
+  2. Walk NEXTCLOUD_PATH and re-ingest every .docx/.pptx via _ingest_one
+  3. Stage 2 enqueue is suppressed (SKIP_STAGE2_ENQUEUE=1)
+
+Without --apply: dry-run. Counts files and chunks, prints a sample, writes nothing.
+"""
+
+import os
+import sys
+import time
+from pathlib import Path
+
+# Set before `ingest` is imported below, so the flag is already present in the
+# environment when that module loads. The module docstring says this variable
+# suppresses the Stage 2 enqueue; where `ingest` actually reads it is not
+# visible from this file -- TODO confirm.
+os.environ["SKIP_STAGE2_ENQUEUE"] = "1"
+
+from dotenv import load_dotenv
+# override=True lets values from the .env file replace variables that are
+# already set in the process environment.
+# NOTE(review): that would include SKIP_STAGE2_ENQUEUE if the .env file defines
+# it -- confirm it does not.
+load_dotenv(Path.home() / "aaronai" / ".env", override=True)
+
+import psycopg2
+from sentence_transformers import SentenceTransformer
+
+# Put this script's own directory first on sys.path so `ingest` resolves to the
+# sibling module next to this file.
+sys.path.insert(0, str(Path(__file__).parent))
+from ingest import _ingest_one, get_pg
+
+# Root directory that find_files() walks recursively.
+NEXTCLOUD_PATH = Path("/home/aaron/nextcloud/data/data/aaron/files")
+# Lower-cased file suffixes this script re-ingests.
+TARGET_EXTS = {".docx", ".pptx"}
+
+# True when --apply appears anywhere on the command line; otherwise main()
+# stops after printing the plan (dry-run).
+APPLY = "--apply" in sys.argv
+
+
+def count_stale():
+    pg = get_pg()
+    cur = pg.cursor()
+    cur.execute(
+        "SELECT lower(substring(source from '\\.[^.]+$')) AS ext, "
+        "COUNT(DISTINCT source) AS files, COUNT(*) AS chunks "
+        "FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$' "
+        "GROUP BY 1 ORDER BY 1"
+    )
+    rows = cur.fetchall()
+    pg.close()
+    return rows
+
+
+def delete_stale():
+    pg = get_pg()
+    cur = pg.cursor()
+    cur.execute("DELETE FROM embeddings WHERE lower(source) ~ '\\.(docx|pptx)$'")
+    deleted = cur.rowcount
+    pg.commit()
+    pg.close()
+    return deleted
+
+
+def find_files():
+    files = []
+    for f in NEXTCLOUD_PATH.rglob("*"):
+        if not f.is_file():
+            continue
+        if f.suffix.lower() not in TARGET_EXTS:
+            continue
+        if f.name.startswith(("~$", ".")):
+            continue
+        files.append(f)
+    return files
+
+
+def main():
+    print(f"Mode: {'APPLY (destructive)' if APPLY else 'DRY-RUN (no writes)'}")
+    print(f"Target: {NEXTCLOUD_PATH}")
+    print(f"Extensions: {sorted(TARGET_EXTS)}")
+    print(f"SKIP_STAGE2_ENQUEUE={os.environ.get('SKIP_STAGE2_ENQUEUE')}")
+    print()
+
+    print("Stale chunks currently in DB:")
+    for ext, files, chunks in count_stale():
+        print(f"  {ext}: {files} files, {chunks} chunks")
+    print()
+
+    files = find_files()
+    by_ext = {}
+    for f in files:
+        by_ext.setdefault(f.suffix.lower(), []).append(f)
+    print(f"Files on disk to re-ingest:")
+    for ext, lst in sorted(by_ext.items()):
+        print(f"  {ext}: {len(lst)} files")
+    print(f"  total: {len(files)}")
+    print()
+    print("Sample (5 random):")
+    import random
+    for f in random.sample(files, min(5, len(files))):
+        print(f"  {f}")
+    print()
+
+    if not APPLY:
+        print("Dry-run only. Re-run with --apply to delete + re-ingest.")
+        return
+
+    print("Deleting stale chunks...")
+    n = delete_stale()
+    print(f"  deleted {n} rows")
+    print()
+
+    print("Loading embedder...")
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    print()
+
+    print(f"Re-ingesting {len(files)} files...")
+    started = time.time()
+    ingested = failed = total_chunks = 0
+    for i, f in enumerate(files, 1):
+        n = _ingest_one(f, embedder, root=NEXTCLOUD_PATH)
+        if n > 0:
+            ingested += 1
+            total_chunks += n
+        else:
+            failed += 1
+        if i % 25 == 0 or i == len(files):
+            elapsed = time.time() - started
+            rate = i / elapsed if elapsed else 0
+            print(f"  [{i}/{len(files)}] ingested={ingested} failed={failed} "
+                  f"chunks={total_chunks} ({rate:.1f} files/s)")
+    elapsed = time.time() - started
+    print()
+    print(f"Done in {elapsed:.0f}s: {ingested} ingested, {failed} failed, "
+          f"{total_chunks} chunks written.")
+
+
+# Run the script only when executed directly, not when imported as a module.
+if __name__ == "__main__":
+    main()