8d560f9f5e
Replaces pure-dense top-8 retrieval with a three-stage pipeline:
- BM25 (tsvector + websearch_to_tsquery) and dense (pgvector) in parallel,
fused with Reciprocal Rank Fusion
- Optional type filter driven by classify_retrieval_intent() so questions
about prior conversations don't pull documents and vice versa
- Cross-encoder rerank (ms-marco-MiniLM-L-6-v2) over RRF candidates before
taking final top-N
Also adds scripts/reindex_docx_pptx.py — one-off re-ingest used to recover
table/header/text-box content in docx and pptx after the 93c0d89 extractor
upgrade — and scripts/test_retrieval.py to exercise the new pipeline against
representative queries.
Schema: requires a GIN index on to_tsvector('english', document). The index
was already created out-of-band via psql, because Apache AGE in
shared_preload_libraries blocks ALTER TABLE on this database.
59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
"""End-to-end test of retrieve_context with intent routing + reranking.

Avoids loading the full FastAPI app; replicates the chat-handler retrieval
call shape and prints classifier output + final ranked sources for each query.
"""
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Load settings from ~/aaronai/.env up front. override=True makes values from
# the file replace variables that are already set in the process environment.
load_dotenv(Path.home() / "aaronai" / ".env", override=True)

# Put this script's own directory first on sys.path so modules that live next
# to it are found ahead of any same-named installed packages.
sys.path.insert(0, str(Path(__file__).parent))

# Stub anthropic so loading api.py doesn't fail without the SDK available.
# Only retrieve_context and classify_retrieval_intent are needed from api.py.
import types

# Reuse an already-registered "anthropic" module if there is one, otherwise
# register an empty placeholder. Either way, its Anthropic attribute is
# replaced with a factory that accepts keyword arguments and returns None.
sys.modules.setdefault("anthropic", types.ModuleType("anthropic"))
sys.modules["anthropic"].Anthropic = lambda **kw: None

# Same idea for faster_whisper: register an empty placeholder unless a module
# of that name has already been loaded.
if "faster_whisper" not in sys.modules:
    sys.modules["faster_whisper"] = types.ModuleType("faster_whisper")

import importlib.util

# Load api.py from this script's directory under the module name "api".
# NOTE: exec_module runs the whole file, including any import-time side
# effects; nothing here restricts execution to definitions only. Errors raised
# while it runs are caught below so the script can carry on.
spec = importlib.util.spec_from_file_location("api", Path(__file__).parent / "api.py")
api = importlib.util.module_from_spec(spec)

# Register the module before executing it, as the importlib recipe for
# importing a source file directly does, so code that looks the module up in
# sys.modules during import (or a later "import api") gets this same instance.
sys.modules["api"] = api

try:
    spec.loader.exec_module(api)
except Exception as e:
    # Best effort: names bound before the failure are still set on `api`,
    # so report the error and keep going.
    print(f"(continuing despite api.py side-effect error: {e})")

# These lookups raise AttributeError if api.py failed before defining them.
retrieve_context = api.retrieve_context
classify_retrieval_intent = api.classify_retrieval_intent

# Sample queries, processed in this order by the loop that follows.
QUERIES = [
    "write me a bio",
    "my professional bio",
    "draft a bio for the Utah application",
    "Aaron Nelson CV consulting and design work",
    "FWN3D consulting",
    "syllabi I have taught",
    "philosophy of teaching",
    "what did I tell Claude about FWN3D",
    "what did we discuss about the Utah job",
    "Hudson Valley Additive Manufacturing Center",
]

# For each query: classify its intent, retrieve with that intent passed as the
# type filter, then print the intent and the returned sources numbered from 1.
for q in QUERIES:
    intent = classify_retrieval_intent(q)
    # retrieve_context returns a pair; only the second element (sources) is
    # printed here.
    pieces, sources = retrieve_context(q, type_filter=intent)
    print(f"\n=== {q!r} ===")
    print(f" intent: {intent}")
    for i, src in enumerate(sources, 1):
        print(f" {i}. {src}")