diff --git a/scripts/api.py b/scripts/api.py index 9073679..e1f25cd 100644 --- a/scripts/api.py +++ b/scripts/api.py @@ -123,11 +123,26 @@ economical, specific, never performative. When answering questions, cite sources and acknowledge uncertainty rather than filling gaps with plausible-sounding content. -You have access to his complete document corpus, conversation history, -and a persistent memory file that carries his current context. Treat -the memory file as ground truth for his present situation. Use web -search automatically when current information is needed. Never -re-brief on context that's already in memory or documents. +You have a persistent memory file (always present below) that carries +Aaron's current context — treat it as ground truth for his present +situation. + +For anything beyond what's in memory, you have a retrieve_documents +tool that searches his full knowledge base: personal documents, +reading library, conversation transcripts, and journal entries. Call +it whenever you need concrete information — names, dates, project +specifics, prior thinking, exhibition records, syllabi, anything you +don't already know. For compound questions, call it multiple times +with different concrete queries; one call per distinct information +need. Prefer specific tokens (named entities, project names, course +codes) over abstract instructional phrasing — search "FWN3D +consulting" not "my work." Results are unfiltered and ranked by +semantic similarity; judge each chunk for relevance and ignore +irrelevant hits rather than forcing them into the answer. + +Use web search automatically when current external information is +needed. Never re-brief on context that's already in memory or +retrieved chunks. When making factual claims about Aaron — his history, credentials, locations, dates, relationships, projects, or any specific event — you must ground the claim in a specific retrieved document or the memory file. Cite the source by name inline. If no source supports the claim, say so explicitly rather than filling the gap with plausible-sounding content. Do not confabulate. If you are inferring rather than citing, mark it as inference.""" @@ -251,24 +266,6 @@ FINAL_LIMIT = 8 _TSQUERY_SANITIZE_RE = re.compile(r"[^\w\s\"'-]") -CONVERSATION_TYPES = ["chatgpt_conversation", "claude_conversation", "aaronai_conversation"] -DOCUMENT_TYPES = ["document"] -MEMORY_TYPES = ["claude_memory"] - -_CONVO_SIGNALS = ( - "what did i tell", "what did we discuss", "what did we talk", - "in our conversation", "you mentioned", "we talked about", - "earlier you said", "earlier i said", "did i tell you", - "did i say", "what did chatgpt", "what did claude", -) -_DOC_SIGNALS = ( - "write me a bio", "draft a bio", "my bio", "my cv", "my resume", - "my professional", "my work history", "my exhibitions", - "my publications", "my syllabi", "my courses", "my teaching", - "my philosophy", "about my career", "draft a cover letter", - "draft my", "write a bio", "professional bio", -) - def _websearch_query(text: str) -> str: """Strip characters websearch_to_tsquery doesn't handle cleanly. Quoted @@ -276,31 +273,6 @@ def _websearch_query(text: str) -> str: return _TSQUERY_SANITIZE_RE.sub(" ", text).strip() -def classify_retrieval_intent(query: str): - """Return (type_filter, folder_exclude_prefixes). Either may be None. - - type_filter restricts the candidate pool by `type`; folder_exclude_prefixes - excludes any chunk whose metadata.folder matches a LIKE 'prefix%' pattern. - - Implementation is a low-effort keyword classifier — explicitly tunable and - swappable. For nuanced routing, replace with an LLM classifier returning - the same shape. - - Precedence: conversation signals win over document signals — "what did I - tell you about my CV" is asking about the conversation, not the CV. - - For biographical/document intent, also exclude the reference library - (Library/Foundations/* — philosophy and cognition books), which is - categorically different from personal artifacts but lives in the same - `type='document'` bucket.""" - q = query.lower() - if any(s in q for s in _CONVO_SIGNALS): - return (CONVERSATION_TYPES, None) - if any(s in q for s in _DOC_SIGNALS): - return (DOCUMENT_TYPES, ["Library/"]) - return (None, None) - - def _rerank(query: str, candidates: list[tuple]) -> list[tuple]: """Cross-encoder rerank. Candidates are (id, document, source, folder, created_at) tuples. Returns the same tuples reordered by reranker score with created_at as @@ -334,71 +306,49 @@ def _dedup_key(doc: str) -> str: return hashlib.md5(doc[:300].lower().encode("utf-8", "ignore")).hexdigest() -def retrieve_context(query, n_results=FINAL_LIMIT, - type_filter=None, folder_exclude_prefixes=None): +def retrieve_context(query, n_results=FINAL_LIMIT): """Hybrid retrieval (dense + lexical, RRF fused) followed by cross-encoder rerank. - Dense (pgvector) handles paraphrase / semantic similarity. - Lexical (tsvector) catches rare named tokens (FWN3D, Sono-Tek, course codes) the embedding model has no signal for. - RRF combines the two rankings without calibrating score scales. - - Cross-encoder rerank scores each (query, chunk) pair jointly, bridging - semantic gaps that bi-encoders can't (e.g., "write me a bio" -> CV chunk). + - Cross-encoder rerank scores each (query, chunk) pair jointly. - Near-duplicate collapse on output so top-N slots aren't burned by multi-folder copies of the same file. - type_filter: optional list of `type` values to restrict the candidate pool to. - folder_exclude_prefixes: optional list of folder LIKE prefixes to exclude. - Both default to None (no restriction). Use classify_retrieval_intent() to derive.""" + No type or folder filtering: imposing a taxonomy at retrieval time is a + heuristic we've explicitly rejected. The reranker ranks, the caller (LLM) + decides what's relevant to its task.""" query_embedding = embedder.encode([query]).tolist()[0] ts_query = _websearch_query(query) context_pieces = [] sources = [] - where_clauses = [] - extra_params = [] - if type_filter: - where_clauses.append("type = ANY(%s)") - extra_params.append(list(type_filter)) - for prefix in (folder_exclude_prefixes or []): - where_clauses.append("(metadata->>'folder' IS NULL OR metadata->>'folder' NOT LIKE %s)") - extra_params.append(prefix + "%") - - common_where = ("WHERE " + " AND ".join(where_clauses)) if where_clauses else "" - try: pg = get_pg() cur = pg.cursor() - # pgvector 0.6 HNSW doesn't iterate past its initial candidate list when - # a restrictive WHERE filter is present — so a filter that excludes the - # top-N nearest leaves nothing. Bumping ef_search forces the index to - # explore more graph nodes. Cheap when unfiltered; load-bearing when filtered. - if where_clauses: - cur.execute("SET LOCAL hnsw.ef_search = 500") - - cur.execute(f""" + cur.execute(""" SELECT id, document, source, metadata->>'folder' AS folder, created_at FROM embeddings - {common_where} ORDER BY embedding <=> %s::vector LIMIT %s - """, (*extra_params, query_embedding, HYBRID_CANDIDATES)) + """, (query_embedding, HYBRID_CANDIDATES)) dense_hits = cur.fetchall() lexical_hits = [] if ts_query: - lex_match = "to_tsvector('english', document) @@ websearch_to_tsquery('english', %s)" - lex_where = ("WHERE " + " AND ".join([lex_match] + where_clauses)) - cur.execute(f""" + cur.execute(""" SELECT id, document, source, metadata->>'folder' AS folder, created_at FROM embeddings - {lex_where} + WHERE to_tsvector('english', document) + @@ websearch_to_tsquery('english', %s) ORDER BY ts_rank(to_tsvector('english', document), websearch_to_tsquery('english', %s)) DESC LIMIT %s - """, (ts_query, *extra_params, ts_query, HYBRID_CANDIDATES)) + """, (ts_query, ts_query, HYBRID_CANDIDATES)) lexical_hits = cur.fetchall() pg.close() @@ -466,13 +416,51 @@ def create_conversation(title="New conversation"): conn.close() return conv_id +RETRIEVE_DOCUMENTS_TOOL = { + "name": "retrieve_documents", + "description": ( + "Search Aaron's knowledge base — personal documents, reading library, " + "conversation transcripts, and journal entries — for content relevant " + "to a query. Call whenever you need concrete information you don't " + "already have from the persistent memory file. For compound questions " + "(e.g. 'bio emphasizing consulting work and recent research'), call " + "this tool multiple times with different concrete queries; one call " + "per distinct information need. Prefer specific named entities, " + "project names, course codes, or topic-specific terms over abstract " + "instructional phrasing — 'FWN3D consulting' retrieves better than " + "'my work'. Results are ranked by semantic + lexical hybrid retrieval " + "and a cross-encoder reranker; no taxonomy is applied, so judge each " + "returned chunk on its own merits and ignore irrelevant hits." + ), + "input_schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query. Use concrete terms.", + }, + }, + "required": ["query"], + }, +} + + +def _execute_retrieve_documents(tool_input): + """Run retrieve_context for a tool call. Returns (tool_result_text, sources).""" + query = (tool_input or {}).get("query", "").strip() + if not query: + return ("No query provided.", []) + pieces, sources = retrieve_context(query) + if not pieces: + return (f"No results for query={query!r}.", []) + parts = [] + for i, (piece, src) in enumerate(zip(pieces, sources), 1): + parts.append(f"[{i}] Source: {src}\n{piece}") + return ("\n\n---\n\n".join(parts), sources) + + def chat(user_message, conversation_id, settings, client_time=None): memory = load_memory() - type_filter, folder_excludes = classify_retrieval_intent(user_message) - context_pieces, sources = retrieve_context( - user_message, type_filter=type_filter, - folder_exclude_prefixes=folder_excludes, - ) history = get_conversation_history(conversation_id) context_parts = [] @@ -480,40 +468,45 @@ def chat(user_message, conversation_id, settings, client_time=None): context_parts.append(f"Current time (user-supplied, not logged): {client_time}") if memory: context_parts.append(f"Aaron's persistent memory:\n\n{memory}") - if context_pieces: - context_str = "\n\n---\n\n".join(context_pieces) - unique_sources = list(set(sources)) - context_parts.append( - f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}" - ) context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else "" full_message = context_block + user_message messages = history + [{"role": "user", "content": full_message}] - tools = [{"type": "web_search_20250305", "name": "web_search"}] if settings.get("web_search", True) else [] + tools = [RETRIEVE_DOCUMENTS_TOOL] + if settings.get("web_search", True): + tools.append({"type": "web_search_20250305", "name": "web_search"}) + + accumulated_sources = [] while True: - kwargs = { - "model": "claude-sonnet-4-6", - "max_tokens": 2048, - "system": SYSTEM_PROMPT, - "messages": messages - } - if tools: - kwargs["tools"] = tools - - response = anthropic_client.messages.create(**kwargs) + response = anthropic_client.messages.create( + model="claude-sonnet-4-6", + max_tokens=2048, + system=SYSTEM_PROMPT, + messages=messages, + tools=tools, + ) if response.stop_reason == "tool_use": messages.append({"role": "assistant", "content": response.content}) tool_results = [] for block in response.content: - if block.type == "tool_use": + if block.type != "tool_use": + continue + if block.name == "retrieve_documents": + result_text, result_sources = _execute_retrieve_documents(block.input) + accumulated_sources.extend(result_sources) tool_results.append({ "type": "tool_result", "tool_use_id": block.id, - "content": "Search completed" + "content": result_text, + }) + else: + tool_results.append({ + "type": "tool_result", + "tool_use_id": block.id, + "content": "Search completed", }) messages.append({"role": "user", "content": tool_results}) else: @@ -521,7 +514,7 @@ def chat(user_message, conversation_id, settings, client_time=None): for block in response.content: if hasattr(block, "text"): assistant_message += block.text - return assistant_message, list(set(sources)) + return assistant_message, list(dict.fromkeys(accumulated_sources)) from contextlib import asynccontextmanager diff --git a/scripts/test_retrieval.py b/scripts/test_retrieval.py index 88ffd75..339f2f1 100644 --- a/scripts/test_retrieval.py +++ b/scripts/test_retrieval.py @@ -14,7 +14,7 @@ load_dotenv(Path.home() / "aaronai" / ".env", override=True) sys.path.insert(0, str(Path(__file__).parent)) # Stub anthropic so api.py import doesn't fail without the SDK loaded. -# We only need retrieve_context + classify_retrieval_intent. +# We only need retrieve_context. import types sys.modules.setdefault("anthropic", types.ModuleType("anthropic")) sys.modules["anthropic"].Anthropic = lambda **kw: None @@ -34,27 +34,20 @@ except Exception as e: print(f"(continuing despite api.py side-effect error: {e})") retrieve_context = api.retrieve_context -classify_retrieval_intent = api.classify_retrieval_intent QUERIES = [ "write me a bio", "my professional bio", - "draft a bio for the Utah application", "Aaron Nelson CV consulting and design work", "FWN3D consulting", "syllabi I have taught", "philosophy of teaching", - "what did I tell Claude about FWN3D", - "what did we discuss about the Utah job", "Hudson Valley Additive Manufacturing Center", + "Aaron Nelson is an artist and educator working in additive manufacturing", ] for q in QUERIES: - type_filter, folder_excludes = classify_retrieval_intent(q) - pieces, sources = retrieve_context( - q, type_filter=type_filter, folder_exclude_prefixes=folder_excludes, - ) + pieces, sources = retrieve_context(q) print(f"\n=== {q!r} ===") - print(f" type_filter: {type_filter} folder_excludes: {folder_excludes}") for i, src in enumerate(sources, 1): print(f" {i}. {src}")