fix(agents): skip auth-profile failure on overload

This commit is contained in:
Altay
2026-03-06 14:16:33 +03:00
parent 4a80d48ea9
commit d389977be4
2 changed files with 80 additions and 7 deletions

View File

@@ -252,6 +252,24 @@ const mockFailedThenSuccessfulAttempt = (errorMessage = "rate limit") => {
);
};
/**
 * Queues two one-shot results on `runEmbeddedAttemptMock`:
 *  1. an attempt whose `promptError` is an `Error` built from `errorMessage`;
 *  2. an attempt whose assistant reply is the text "ok" with stop reason "stop".
 *
 * @param errorMessage Message used for the `Error` placed on the first attempt.
 */
const mockPromptErrorThenSuccessfulAttempt = (errorMessage: string) => {
  // First queued result: the attempt that carries the prompt error.
  const failedAttempt = makeAttempt({ promptError: new Error(errorMessage) });
  runEmbeddedAttemptMock.mockResolvedValueOnce(failedAttempt);

  // Second queued result: an attempt that ends with a plain "ok" assistant turn.
  const okAssistant = buildAssistant({
    stopReason: "stop",
    content: [{ type: "text", text: "ok" }],
  });
  const okAttempt = makeAttempt({
    assistantTexts: ["ok"],
    lastAssistant: okAssistant,
  });
  runEmbeddedAttemptMock.mockResolvedValueOnce(okAttempt);
};
async function runAutoPinnedOpenAiTurn(params: {
agentDir: string;
workspaceDir: string;
@@ -320,6 +338,28 @@ async function runAutoPinnedRotationCase(params: {
});
}
/**
 * Runs one auto-pinned OpenAI turn in which the first embedded attempt reports
 * a prompt error and the second one succeeds, then returns the usage stats
 * read back from the agent directory.
 *
 * Asserts (via `expect`) that the embedded attempt mock was invoked exactly twice.
 *
 * @param params.errorMessage Message for the prompt error on the first attempt.
 * @param params.sessionKey   Session key forwarded to the turn.
 * @param params.runId        Run id forwarded to the turn.
 * @returns An object holding the `usageStats` read after the turn.
 */
async function runAutoPinnedPromptErrorRotationCase(params: {
  errorMessage: string;
  sessionKey: string;
  runId: string;
}) {
  const { errorMessage, sessionKey, runId } = params;

  // Reset recorded calls so the call-count expectation below only sees this case.
  runEmbeddedAttemptMock.mockClear();

  return withAgentWorkspace(async (workspace) => {
    const { agentDir, workspaceDir } = workspace;

    await writeAuthStore(agentDir);
    mockPromptErrorThenSuccessfulAttempt(errorMessage);

    await runAutoPinnedOpenAiTurn({ agentDir, workspaceDir, sessionKey, runId });

    // One call for the failing attempt, one for the successful retry.
    expect(runEmbeddedAttemptMock).toHaveBeenCalledTimes(2);

    const stats = await readUsageStats(agentDir);
    return { usageStats: stats };
  });
}
function mockSingleSuccessfulAttempt() {
runEmbeddedAttemptMock.mockResolvedValueOnce(
makeAttempt({
@@ -639,13 +679,24 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
});
it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
it("rotates for overloaded assistant failures across auto-pinned profiles", async () => {
  // Provider-style "overloaded" error payload used as the failing attempt's message.
  const overloadedPayload =
    '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}';

  const outcome = await runAutoPinnedRotationCase({
    errorMessage: overloadedPayload,
    sessionKey: "agent:test:overloaded-rotation",
    runId: "run:overloaded-rotation",
  });
  const stats = outcome.usageStats;

  // The second profile must have been used ...
  expect(typeof stats["openai:p2"]?.lastUsed).toBe("number");
  // ... and the first profile must not have been given a cooldown.
  expect(stats["openai:p1"]?.cooldownUntil).toBeUndefined();
});
it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
  // Provider-style "overloaded" error payload used as the prompt error message.
  const overloadedPayload =
    '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}';

  const outcome = await runAutoPinnedPromptErrorRotationCase({
    errorMessage: overloadedPayload,
    sessionKey: "agent:test:overloaded-prompt-rotation",
    runId: "run:overloaded-prompt-rotation",
  });
  const stats = outcome.usageStats;

  // The second profile must have been used ...
  expect(typeof stats["openai:p2"]?.lastUsed).toBe("number");
  // ... and the first profile must not have been given a cooldown.
  expect(stats["openai:p1"]?.cooldownUntil).toBeUndefined();
});
it("rotates on timeout without cooling down the timed-out profile", async () => {

View File

@@ -10,6 +10,7 @@ import { resolveOpenClawAgentDir } from "../agent-paths.js";
import { hasConfiguredModelFallbacks } from "../agent-scope.js";
import {
isProfileInCooldown,
type AuthProfileFailureReason,
markAuthProfileFailure,
markAuthProfileGood,
markAuthProfileUsed,
@@ -41,6 +42,7 @@ import {
isLikelyContextOverflowError,
isFailoverAssistantError,
isFailoverErrorMessage,
isOverloadedErrorMessage,
parseImageSizeError,
parseImageDimensionError,
isRateLimitAssistantError,
@@ -721,7 +723,7 @@ export async function runEmbeddedPiAgent(
let runLoopIterations = 0;
const maybeMarkAuthProfileFailure = async (failure: {
profileId?: string;
reason?: Parameters<typeof markAuthProfileFailure>[0]["reason"] | null;
reason?: AuthProfileFailureReason | null;
config?: RunEmbeddedPiAgentParams["config"];
agentDir?: RunEmbeddedPiAgentParams["agentDir"];
}) => {
@@ -737,6 +739,21 @@ export async function runEmbeddedPiAgent(
agentDir,
});
};
/**
 * Maps a failover classification to the reason that should be recorded
 * against the auth profile, or `null` when nothing should be recorded.
 *
 * @param errorText      Raw error text, inspected only for the overloaded check.
 * @param failoverReason Failover classification for that error, or `null`.
 * @returns `null` when there is no classification, when it is "timeout", or
 *          when a "rate_limit" classification carries an overloaded message;
 *          otherwise the failover reason itself.
 */
const resolveAuthProfileFailureReason = (
  errorText: string,
  failoverReason: FailoverReason | null,
): AuthProfileFailureReason | null => {
  // No classification: nothing to record.
  if (!failoverReason) {
    return null;
  }
  // Timeouts are never recorded as auth-profile failures.
  if (failoverReason === "timeout") {
    return null;
  }
  // Overloaded provider responses currently stay on the rate_limit failover
  // lane so existing retry/failover behavior keeps working, but they should
  // not be recorded as auth-profile failures.
  const overloadedRateLimit =
    failoverReason === "rate_limit" && isOverloadedErrorMessage(errorText);
  return overloadedRateLimit ? null : failoverReason;
};
try {
let authRetryPending = false;
// Hoisted so the retry-limit error path can use the most recent API total.
@@ -1145,9 +1162,13 @@ export async function runEmbeddedPiAgent(
};
}
const promptFailoverReason = classifyFailoverReason(errorText);
const promptProfileFailureReason = resolveAuthProfileFailureReason(
errorText,
promptFailoverReason,
);
await maybeMarkAuthProfileFailure({
profileId: lastProfileId,
reason: promptFailoverReason,
reason: promptProfileFailureReason,
});
if (
isFailoverErrorMessage(errorText) &&
@@ -1198,6 +1219,10 @@ export async function runEmbeddedPiAgent(
const billingFailure = isBillingAssistantError(lastAssistant);
const failoverFailure = isFailoverAssistantError(lastAssistant);
const assistantFailoverReason = classifyFailoverReason(lastAssistant?.errorMessage ?? "");
const assistantProfileFailureReason = resolveAuthProfileFailureReason(
lastAssistant?.errorMessage ?? "",
assistantFailoverReason,
);
const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");
@@ -1237,10 +1262,7 @@ export async function runEmbeddedPiAgent(
if (shouldRotate) {
if (lastProfileId) {
const reason =
timedOut || assistantFailoverReason === "timeout"
? "timeout"
: (assistantFailoverReason ?? "unknown");
const reason = timedOut ? "timeout" : assistantProfileFailureReason;
// Skip cooldown for timeouts: a timeout is model/network-specific,
// not an auth issue. Marking the profile would poison fallback models
// on the same provider (e.g. gpt-5.3 timeout blocks gpt-5.2).