fix(agents): skip auth-profile failure on overload

2026-04-18 12:17:26 +00:00 · 2026-03-06 14:16:33 +03:00
parent 4a80d48ea9
commit d389977be4
2 changed files with 80 additions and 7 deletions
--- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts
+++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts
@@ -252,6 +252,24 @@ const mockFailedThenSuccessfulAttempt = (errorMessage = "rate limit") => {
    );
 };

+// Queues two runEmbeddedAttemptMock results: an attempt carrying a prompt
+// error built from errorMessage, then a successful attempt answering "ok".
+const mockPromptErrorThenSuccessfulAttempt = (errorMessage: string) => {
+  const failedAttempt = makeAttempt({
+    promptError: new Error(errorMessage),
+  });
+  const recoveredAttempt = makeAttempt({
+    assistantTexts: ["ok"],
+    lastAssistant: buildAssistant({
+      stopReason: "stop",
+      content: [{ type: "text", text: "ok" }],
+    }),
+  });
+  runEmbeddedAttemptMock
+    .mockResolvedValueOnce(failedAttempt)
+    .mockResolvedValueOnce(recoveredAttempt);
+};
+
 async function runAutoPinnedOpenAiTurn(params: {
  agentDir: string;
  workspaceDir: string;
@@ -320,6 +338,28 @@ async function runAutoPinnedRotationCase(params: {
  });
 }

+// Drives one auto-pinned OpenAI turn whose first attempt is mocked to fail
+// with a prompt error, checks that exactly two attempts ran, and returns the
+// usage stats obtained from readUsageStats for the agent directory.
+async function runAutoPinnedPromptErrorRotationCase(params: {
+  errorMessage: string;
+  sessionKey: string;
+  runId: string;
+}) {
+  const { errorMessage, sessionKey, runId } = params;
+  runEmbeddedAttemptMock.mockClear();
+  return withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
+    await writeAuthStore(agentDir);
+    mockPromptErrorThenSuccessfulAttempt(errorMessage);
+    await runAutoPinnedOpenAiTurn({ agentDir, workspaceDir, sessionKey, runId });
+
+    // One mocked prompt-error attempt plus one mocked successful attempt.
+    expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2);
+    const stats = await readUsageStats(agentDir);
+    return { usageStats: stats };
+  });
+}
+
 function mockSingleSuccessfulAttempt() {
  runEmbeddedAttemptMock.mockResolvedValueOnce(
    makeAttempt({
@@ -639,13 +679,24 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
    expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
  });

-  it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
+  it("rotates for overloaded assistant failures across auto-pinned profiles", async () => {
    const { usageStats } = await runAutoPinnedRotationCase({
      errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
      sessionKey: "agent:test:overloaded-rotation",
      runId: "run:overloaded-rotation",
    });
    expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
+    expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
+  });
+
+  it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
+    const { usageStats } = await runAutoPinnedPromptErrorRotationCase({
+      errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
+      sessionKey: "agent:test:overloaded-prompt-rotation",
+      runId: "run:overloaded-prompt-rotation",
+    });
+    expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
+    expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
  });

  it("rotates on timeout without cooling down the timed-out profile", async () => {
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -10,6 +10,7 @@ import { resolveOpenClawAgentDir } from "../agent-paths.js";
 import { hasConfiguredModelFallbacks } from "../agent-scope.js";
 import {
  isProfileInCooldown,
+  type AuthProfileFailureReason,
  markAuthProfileFailure,
  markAuthProfileGood,
  markAuthProfileUsed,
@@ -41,6 +42,7 @@ import {
  isLikelyContextOverflowError,
  isFailoverAssistantError,
  isFailoverErrorMessage,
+  isOverloadedErrorMessage,
  parseImageSizeError,
  parseImageDimensionError,
  isRateLimitAssistantError,
@@ -721,7 +723,7 @@ export async function runEmbeddedPiAgent(
      let runLoopIterations = 0;
      const maybeMarkAuthProfileFailure = async (failure: {
        profileId?: string;
-        reason?: Parameters<typeof markAuthProfileFailure>[0]["reason"] | null;
+        reason?: AuthProfileFailureReason | null;
        config?: RunEmbeddedPiAgentParams["config"];
        agentDir?: RunEmbeddedPiAgentParams["agentDir"];
      }) => {
@@ -737,6 +739,21 @@ export async function runEmbeddedPiAgent(
          agentDir,
        });
      };
+      // Translates a failover classification into the reason handed to
+      // maybeMarkAuthProfileFailure; null when no profile failure should be recorded.
+      const resolveAuthProfileFailureReason = (
+        errorText: string,
+        failoverReason: FailoverReason | null,
+      ): AuthProfileFailureReason | null => {
+        // No classification, or a timeout: yields no profile-failure reason.
+        if (failoverReason === "timeout" || !failoverReason) {
+          return null;
+        }
+        // Overloaded responses are classified as rate_limit so retry/failover keeps
+        // working, yet they must not be recorded as auth-profile failures.
+        const overloaded = failoverReason === "rate_limit" && isOverloadedErrorMessage(errorText);
+        return overloaded ? null : failoverReason;
+      };
      try {
        let authRetryPending = false;
        // Hoisted so the retry-limit error path can use the most recent API total.
@@ -1145,9 +1162,13 @@ export async function runEmbeddedPiAgent(
              };
            }
            const promptFailoverReason = classifyFailoverReason(errorText);
+            const promptProfileFailureReason = resolveAuthProfileFailureReason(
+              errorText,
+              promptFailoverReason,
+            );
            await maybeMarkAuthProfileFailure({
              profileId: lastProfileId,
-              reason: promptFailoverReason,
+              reason: promptProfileFailureReason,
            });
            if (
              isFailoverErrorMessage(errorText) &&
@@ -1198,6 +1219,10 @@ export async function runEmbeddedPiAgent(
          const billingFailure = isBillingAssistantError(lastAssistant);
          const failoverFailure = isFailoverAssistantError(lastAssistant);
          const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
+          const assistantProfileFailureReason = resolveAuthProfileFailureReason(
+            lastAssistant?.errorMessage ?? "",
+            assistantFailoverReason,
+          );
          const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
          const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");

@@ -1237,10 +1262,7 @@ export async function runEmbeddedPiAgent(

          if (shouldRotate) {
            if (lastProfileId) {
-              const reason =
-                timedOut || assistantFailoverReason === "timeout"
-                  ? "timeout"
-                  : (assistantFailoverReason ?? "unknown");
+              const reason = timedOut ? "timeout" : assistantProfileFailureReason;
              // Skip cooldown for timeouts: a timeout is model/network-specific,
              // not an auth issue. Marking the profile would poison fallback models
              // on the same provider (e.g. gpt-5.3 timeout blocks gpt-5.2).