fix(cache): inject cache_control into system prompt for OpenRouter Anthropic (#15151) (#17473)

* fix(cache): inject cache_control into system prompt for OpenRouter Anthropic

  Add an onPayload wrapper that injects cache_control: { type: "ephemeral" } into the system/developer message content for OpenRouter requests routed to Anthropic models. The system prompt is typically ~18k tokens and was being re-processed on every request without caching.

  Fixes #15151

* Changelog: add OpenRouter note for #17473

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
2026-05-09 18:14:31 +00:00 · 2026-02-22 19:27:01 +02:00
parent 66529c7aa5
commit c52b2ad5c3
3 changed files with 155 additions and 0 deletions
--- a/src/agents/pi-embedded-runner/extra-params.ts
+++ b/src/agents/pi-embedded-runner/extra-params.ts
@@ -290,6 +290,59 @@ function createAnthropicBetaHeadersWrapper(
  };
 }

+/**
+ * Report whether a provider/model pair is an Anthropic model served through
+ * OpenRouter. Both comparisons are case-insensitive: the provider must equal
+ * "openrouter" and the model id must begin with the "anthropic/" prefix.
+ */
+function isOpenRouterAnthropicModel(provider: string, modelId: string): boolean {
+  if (provider.toLowerCase() !== "openrouter") {
+    return false;
+  }
+  return modelId.toLowerCase().startsWith("anthropic/");
+}
+
+/**
+ * Loose view of one entry in an outgoing request payload's `messages` array.
+ * Both fields are optional and `content` is left as `unknown` because the
+ * payload is inspected at runtime before anything is read from it.
+ */
+interface PayloadMessage {
+  role?: string;
+  content?: unknown;
+}
+
+/**
+ * Inject cache_control into the system message for OpenRouter Anthropic models.
+ * OpenRouter passes through Anthropic's cache_control field — caching the system
+ * prompt avoids re-processing it on every request.
+ *
+ * The returned stream function delegates to `baseStreamFn` (or `streamSimple`
+ * when none is given). For any model that is not an OpenRouter "anthropic/..."
+ * model the call is forwarded untouched. Otherwise `options.onPayload` is
+ * replaced by a hook that mutates the outgoing payload in place and then calls
+ * the caller's original `onPayload`, if there was one:
+ *
+ * - string content on a "system"/"developer" message becomes a single text
+ *   block carrying `cache_control: { type: "ephemeral" }`;
+ * - array content gets `cache_control` set on its last block.
+ *
+ * Blank prompts and empty trailing text blocks are left alone, as are
+ * malformed (null / non-object) message entries.
+ */
+function createOpenRouterSystemCacheWrapper(baseStreamFn: StreamFn | undefined): StreamFn {
+  const underlying = baseStreamFn ?? streamSimple;
+  return (model, context, options) => {
+    if (
+      typeof model.provider !== "string" ||
+      typeof model.id !== "string" ||
+      !isOpenRouterAnthropicModel(model.provider, model.id)
+    ) {
+      return underlying(model, context, options);
+    }
+
+    const originalOnPayload = options?.onPayload;
+    return underlying(model, context, {
+      ...options,
+      onPayload: (payload) => {
+        const messages = (payload as Record<string, unknown> | null | undefined)?.messages;
+        if (Array.isArray(messages)) {
+          for (const msg of messages as Array<PayloadMessage | null | undefined>) {
+            // A malformed entry must not make the payload hook throw.
+            if (!msg || typeof msg !== "object") {
+              continue;
+            }
+            if (msg.role !== "system" && msg.role !== "developer") {
+              continue;
+            }
+            if (typeof msg.content === "string") {
+              // A blank prompt has nothing worth caching, and Anthropic does
+              // not cache empty text blocks, so leave it as a plain string.
+              if (msg.content.trim().length === 0) {
+                continue;
+              }
+              msg.content = [
+                { type: "text", text: msg.content, cache_control: { type: "ephemeral" } },
+              ];
+            } else if (Array.isArray(msg.content) && msg.content.length > 0) {
+              // Marking the last block puts the cache breakpoint at the end of
+              // the prompt, so everything before it is covered as well.
+              const last: unknown = msg.content[msg.content.length - 1];
+              if (last && typeof last === "object") {
+                const block = last as Record<string, unknown>;
+                const isEmptyText = block.type === "text" && block.text === "";
+                if (!isEmptyText) {
+                  block.cache_control = { type: "ephemeral" };
+                }
+              }
+            }
+          }
+        }
+        originalOnPayload?.(payload);
+      },
+    });
+  };
+}
+
 /**
 * Map OpenClaw's ThinkLevel to OpenRouter's reasoning.effort values.
 * "off" maps to "none"; all other levels pass through as-is.
@@ -426,6 +479,7 @@ export function applyExtraParamsToAgent(
  if (provider === "openrouter") {
    log.debug(`applying OpenRouter app attribution headers for ${provider}/${modelId}`);
    agent.streamFn = createOpenRouterWrapper(agent.streamFn, thinkingLevel);
+    agent.streamFn = createOpenRouterSystemCacheWrapper(agent.streamFn);
  }

  // Enable Z.AI tool_stream for real-time tool call streaming.