From 84994f92827e411e7be9d10ce6a1b85e65046f56 Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Tue, 19 May 2026 23:13:43 +0000 Subject: [PATCH] api.py: prompt-cache system prompt and memory across tool_use round-trip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move persistent memory from the user message into system blocks with cache_control: ephemeral on the last block. The static prefix (system prompt + memory, ~3-5K tokens typically) is identical between the two LLM calls of a tool_use round-trip and stable across turns within the 5-minute cache TTL. Without this, the tool-call retrieval architecture roughly doubled input token cost on retrieval-needed turns (full context billed twice). With cache reads at ~10% of standard input, the duplication cost drops by ~90% — the "twice as expensive" hit becomes "slightly more expensive plus tool overhead." client_time stays in the user message (per-turn dynamic, should not be in the cached prefix). --- scripts/api.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/scripts/api.py b/scripts/api.py index e1f25cd..8ee05c9 100644 --- a/scripts/api.py +++ b/scripts/api.py @@ -463,13 +463,28 @@ def chat(user_message, conversation_id, settings, client_time=None): memory = load_memory() history = get_conversation_history(conversation_id) - context_parts = [] - if client_time: - context_parts.append(f"Current time (user-supplied, not logged): {client_time}") + # System prompt + persistent memory are stable across the tool_use round-trip + # and across turns within the 5-minute cache TTL. Putting cache_control on the + # last system block creates a cache breakpoint here — the second LLM call in a + # tool_use turn reads this prefix from cache (~10% of standard input cost) + # instead of re-billing it. Memory lives here (not in the user message) so its + # position stays stable for cache hits. + system_blocks = [{"type": "text", "text": SYSTEM_PROMPT}] if memory: - context_parts.append(f"Aaron's persistent memory:\n\n{memory}") - context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else "" - full_message = context_block + user_message + system_blocks.append({ + "type": "text", + "text": f"Aaron's persistent memory:\n\n{memory}", + }) + system_blocks[-1]["cache_control"] = {"type": "ephemeral"} + + # client_time is per-turn dynamic, so it stays out of the cached prefix. + if client_time: + full_message = ( + f"Current time (user-supplied, not logged): {client_time}\n\n" + f"---\n\n{user_message}" + ) + else: + full_message = user_message messages = history + [{"role": "user", "content": full_message}] @@ -483,7 +498,7 @@ def chat(user_message, conversation_id, settings, client_time=None): response = anthropic_client.messages.create( model="claude-sonnet-4-6", max_tokens=2048, - system=SYSTEM_PROMPT, + system=system_blocks, messages=messages, tools=tools, )