From 9bb083f0659a840c4112d3b5d94b6cb8cfa2583d Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Wed, 20 May 2026 02:22:54 +0000 Subject: [PATCH] chat: cap retrieve_documents per turn, truncate displayed citations, broaden lock-file skip - MAX_RETRIEVALS_PER_TURN (5): after five retrieve_documents calls in a single turn, further calls return a budget-exhausted message instead of executing. Caps cost on runaway multi-query loops without forbidding compound questions. - MAX_CITED_SOURCES (5): accumulated_sources was growing to 14+ entries across multiple tool calls and showing chunks Claude never actually used. Cap the list returned to the UI at 5, preserving insertion order so the highest-relevance early-call results survive. Proper fix (Claude-driven inline citations) is bigger work, noted for later. - ingest.py lock-file skip: changed prefix tuple from ("~$", ".") to ("~", ".") so it catches Office lock files even when Nextcloud's filesystem encoding has mangled the "$" into a unicode replacement char. Matches what watcher.py already does. --- scripts/api.py | 24 +++++++++++++++++++++--- scripts/ingest.py | 4 +++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/scripts/api.py b/scripts/api.py index 7c0c5cc..b882503 100644 --- a/scripts/api.py +++ b/scripts/api.py @@ -277,6 +277,8 @@ def remove_from_memory(item): HYBRID_CANDIDATES = 30 RRF_K = 60 FINAL_LIMIT = 8 +MAX_RETRIEVALS_PER_TURN = 5 +MAX_CITED_SOURCES = 5 _TSQUERY_SANITIZE_RE = re.compile(r"[^\w\s\"'-]") @@ -636,6 +638,7 @@ def chat(user_message, conversation_id, settings, client_time=None): tools.append({"type": "web_search_20250305", "name": "web_search"}) accumulated_sources = [] + retrieval_count = 0 while True: response = anthropic_client.messages.create( @@ -653,8 +656,17 @@ def chat(user_message, conversation_id, settings, client_time=None): if block.type != "tool_use": continue if block.name == "retrieve_documents": - result_text, result_sources = _execute_retrieve_documents(block.input) - accumulated_sources.extend(result_sources) + if retrieval_count >= MAX_RETRIEVALS_PER_TURN: + result_text = ( + f"Retrieval budget exhausted " + f"({MAX_RETRIEVALS_PER_TURN} calls used this turn). " + "Answer with the information you already have or " + "tell Aaron you need a more focused question." + ) + else: + result_text, result_sources = _execute_retrieve_documents(block.input) + accumulated_sources.extend(result_sources) + retrieval_count += 1 tool_results.append({ "type": "tool_result", "tool_use_id": block.id, @@ -679,7 +691,13 @@ def chat(user_message, conversation_id, settings, client_time=None): for block in response.content: if hasattr(block, "text"): assistant_message += block.text - return assistant_message, list(dict.fromkeys(accumulated_sources)) + # Cap citations: accumulated_sources can grow large across multiple + # retrieve_documents calls and not every chunk that came back was + # actually used in the answer. Insertion order preserves rank + # (each call returns chunks reranker-ordered, so the earliest + # entries are the highest-relevance from the most direct queries). + deduped = list(dict.fromkeys(accumulated_sources)) + return assistant_message, deduped[:MAX_CITED_SOURCES] from contextlib import asynccontextmanager diff --git a/scripts/ingest.py b/scripts/ingest.py index 8b37f8d..7d144bb 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -82,7 +82,9 @@ IGNORED_TOP_FOLDERS = {"Drafts"} def _ingest_one(filepath: Path, embedder, root: Path = None) -> int: """Ingest a single file. Returns chunk count, 0 on skip/failure.""" - if filepath.name.startswith(("~$", ".")): + # "~" catches Office lock files (~$) including the case where Nextcloud + # filesystem encoding has mangled the "$" to a unicode replacement char. + if filepath.name.startswith(("~", ".")): return 0 if filepath.suffix.lower() not in SUPPORTED: return 0