api.py: hybrid retrieval with intent routing and cross-encoder rerank

Replaces pure-dense top-8 retrieval with a three-stage pipeline:
- BM25 (tsvector + websearch_to_tsquery) and dense (pgvector) in parallel,
  fused with Reciprocal Rank Fusion
- Optional type filter driven by classify_retrieval_intent() so questions
  about prior conversations don't pull documents and vice versa
- Cross-encoder rerank (ms-marco-MiniLM-L-6-v2) over RRF candidates before
  taking final top-N

Also adds scripts/reindex_docx_pptx.py — one-off re-ingest used to recover
table/header/text-box content in docx and pptx after the 93c0d89 extractor
upgrade — and scripts/test_retrieval.py to exercise the new pipeline against
representative queries.

Schema: requires GIN index on to_tsvector('english', document) (already
created out-of-band via psql since Apache AGE in shared_preload_libraries
blocks ALTER TABLE on this database).
This commit is contained in:
2026-05-19 21:11:15 +00:00
parent 732e450d21
commit 8d560f9f5e
4 changed files with 322 additions and 15 deletions
+58
View File
@@ -0,0 +1,58 @@
"""End-to-end test of retrieve_context with intent routing + reranking.
Avoids loading the full FastAPI app; replicates the chat-handler retrieval
call shape and prints classifier output + final ranked sources for each query.
"""
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
sys.path.insert(0, str(Path(__file__).parent))
# Stub anthropic so api.py import doesn't fail without the SDK loaded.
# We only need retrieve_context + classify_retrieval_intent.
import types
sys.modules.setdefault("anthropic", types.ModuleType("anthropic"))
sys.modules["anthropic"].Anthropic = lambda **kw: None
# Same for whisper if present
if "faster_whisper" not in sys.modules:
sys.modules["faster_whisper"] = types.ModuleType("faster_whisper")
import importlib.util
spec = importlib.util.spec_from_file_location("api", Path(__file__).parent / "api.py")
api = importlib.util.module_from_spec(spec)
# Don't execute the whole module (it starts FastAPI). Instead, exec only definitions.
# Easier: just import the functions we need by exec'ing the file but catching errors.
try:
spec.loader.exec_module(api)
except Exception as e:
print(f"(continuing despite api.py side-effect error: {e})")
retrieve_context = api.retrieve_context
classify_retrieval_intent = api.classify_retrieval_intent
QUERIES = [
"write me a bio",
"my professional bio",
"draft a bio for the Utah application",
"Aaron Nelson CV consulting and design work",
"FWN3D consulting",
"syllabi I have taught",
"philosophy of teaching",
"what did I tell Claude about FWN3D",
"what did we discuss about the Utah job",
"Hudson Valley Additive Manufacturing Center",
]
for q in QUERIES:
intent = classify_retrieval_intent(q)
pieces, sources = retrieve_context(q, type_filter=intent)
print(f"\n=== {q!r} ===")
print(f" intent: {intent}")
for i, src in enumerate(sources, 1):
print(f" {i}. {src}")