api.py: prompt-cache system prompt and memory across tool_use round-trip
Move persistent memory out of the user message and into the system blocks, with cache_control: ephemeral set on the last block. The static prefix (system prompt + memory, typically ~3-5K tokens) is identical across the two LLM calls of a tool_use round-trip and stays stable across turns within the 5-minute cache TTL. Without caching, the tool-call retrieval architecture roughly doubled input token cost on turns that needed retrieval, because the full context was billed twice. With cache reads billed at ~10% of the standard input rate, the cost of re-sending that prefix drops by ~90% — the "twice as expensive" hit becomes "slightly more expensive plus tool overhead." client_time stays in the user message: it changes on every turn, so it must not be part of the cached prefix.
This commit is contained in:
+22
-7
@@ -463,13 +463,28 @@ def chat(user_message, conversation_id, settings, client_time=None):
|
|||||||
memory = load_memory()
|
memory = load_memory()
|
||||||
history = get_conversation_history(conversation_id)
|
history = get_conversation_history(conversation_id)
|
||||||
|
|
||||||
context_parts = []
|
# System prompt + persistent memory are stable across the tool_use round-trip
|
||||||
if client_time:
|
# and across turns within the 5-minute cache TTL. Putting cache_control on the
|
||||||
context_parts.append(f"Current time (user-supplied, not logged): {client_time}")
|
# last system block creates a cache breakpoint here — the second LLM call in a
|
||||||
|
# tool_use turn reads this prefix from cache (~10% of standard input cost)
|
||||||
|
# instead of re-billing it. Memory lives here (not in the user message) so its
|
||||||
|
# position stays stable for cache hits.
|
||||||
|
system_blocks = [{"type": "text", "text": SYSTEM_PROMPT}]
|
||||||
if memory:
|
if memory:
|
||||||
context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
|
system_blocks.append({
|
||||||
context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
|
"type": "text",
|
||||||
full_message = context_block + user_message
|
"text": f"Aaron's persistent memory:\n\n{memory}",
|
||||||
|
})
|
||||||
|
system_blocks[-1]["cache_control"] = {"type": "ephemeral"}
|
||||||
|
|
||||||
|
# client_time is per-turn dynamic, so it stays out of the cached prefix.
|
||||||
|
if client_time:
|
||||||
|
full_message = (
|
||||||
|
f"Current time (user-supplied, not logged): {client_time}\n\n"
|
||||||
|
f"---\n\n{user_message}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
full_message = user_message
|
||||||
|
|
||||||
messages = history + [{"role": "user", "content": full_message}]
|
messages = history + [{"role": "user", "content": full_message}]
|
||||||
|
|
||||||
@@ -483,7 +498,7 @@ def chat(user_message, conversation_id, settings, client_time=None):
|
|||||||
response = anthropic_client.messages.create(
|
response = anthropic_client.messages.create(
|
||||||
model="claude-sonnet-4-6",
|
model="claude-sonnet-4-6",
|
||||||
max_tokens=2048,
|
max_tokens=2048,
|
||||||
system=SYSTEM_PROMPT,
|
system=system_blocks,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user