From c7148f1a664f20392229cbf79338a2e9999f4844 Mon Sep 17 00:00:00 2001 From: Altay Date: Fri, 6 Mar 2026 15:36:03 +0300 Subject: [PATCH] fix(agents): back off before overload failover --- CHANGELOG.md | 2 +- ...pi-agent.auth-profile-rotation.e2e.test.ts | 27 +++++++++++++++++++ src/agents/pi-embedded-runner/run.ts | 25 +++++++++++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c17ef9c1c0..5d2cc726f54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -118,7 +118,7 @@ Docs: https://docs.openclaw.ai - Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n. - Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant. - Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot. -- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts and keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider. +- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts, add short overload backoff before retry/failover, and keep overloaded prompt/assistant failures out of auth-profile failure state so transient provider overloads do not poison later profile selection on the same provider. - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan. 
- Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo. - Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc. diff --git a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts index a192cb65ba5..4b9f5d1e090 100644 --- a/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts +++ b/src/agents/pi-embedded-runner.run-embedded-pi-agent.auth-profile-rotation.e2e.test.ts @@ -9,11 +9,28 @@ import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>(); const resolveCopilotApiTokenMock = vi.fn(); +const { computeBackoffMock, sleepWithAbortMock } = vi.hoisted(() => ({ + computeBackoffMock: vi.fn( + ( + _policy: { initialMs: number; maxMs: number; factor: number; jitter: number }, + _attempt: number, + ) => 321, + ), + sleepWithAbortMock: vi.fn(async (_ms: number, _abortSignal?: AbortSignal) => undefined), +})); vi.mock("./pi-embedded-runner/run/attempt.js", () => ({ runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params), })); +vi.mock("../infra/backoff.js", () => ({ + computeBackoff: ( + policy: { initialMs: number; maxMs: number; factor: number; jitter: number }, + attempt: number, + ) => computeBackoffMock(policy, attempt), + sleepWithAbort: (ms: number, abortSignal?: AbortSignal) => sleepWithAbortMock(ms, abortSignal), +})); + vi.mock("../providers/github-copilot-token.js", () => ({ DEFAULT_COPILOT_API_BASE_URL: "https://api.individual.githubcopilot.com", resolveCopilotApiToken: (...args: unknown[]) => 
resolveCopilotApiTokenMock(...args), @@ -43,6 +60,8 @@ beforeEach(() => { vi.useRealTimers(); runEmbeddedAttemptMock.mockClear(); resolveCopilotApiTokenMock.mockReset(); + computeBackoffMock.mockClear(); + sleepWithAbortMock.mockClear(); }); const baseUsage = { @@ -687,6 +706,9 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); + expect(computeBackoffMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined); }); it("rotates for overloaded prompt failures across auto-pinned profiles", async () => { @@ -697,6 +719,9 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); + expect(computeBackoffMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledTimes(1); + expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined); }); it("rotates on timeout without cooling down the timed-out profile", async () => { @@ -707,6 +732,8 @@ describe("runEmbeddedPiAgent auth profile rotation", () => { }); expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number"); expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined(); + expect(computeBackoffMock).not.toHaveBeenCalled(); + expect(sleepWithAbortMock).not.toHaveBeenCalled(); }); it("rotates on bare service unavailable without cooling down the profile", async () => { diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 1e11632d3eb..019d3e2efc1 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -1,6 +1,7 @@ import { randomBytes } from "node:crypto"; import fs from "node:fs/promises"; import type { ThinkLevel } from 
"../../auto-reply/thinking.js"; +import { computeBackoff, sleepWithAbort, type BackoffPolicy } from "../../infra/backoff.js"; import { generateSecureToken } from "../../infra/secure-random.js"; import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js"; import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js"; @@ -77,6 +78,12 @@ type CopilotTokenState = { const COPILOT_REFRESH_MARGIN_MS = 5 * 60 * 1000; const COPILOT_REFRESH_RETRY_MS = 60 * 1000; const COPILOT_REFRESH_MIN_DELAY_MS = 5 * 1000; +const OVERLOAD_FAILOVER_BACKOFF_POLICY: BackoffPolicy = { + initialMs: 250, + maxMs: 1_500, + factor: 2, + jitter: 0.2, +}; // Avoid Anthropic's refusal test token poisoning session transcripts. const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL"; @@ -720,6 +727,7 @@ export async function runEmbeddedPiAgent( let lastRunPromptUsage: ReturnType | undefined; let autoCompactionCount = 0; let runLoopIterations = 0; + let overloadFailoverAttempts = 0; const maybeMarkAuthProfileFailure = async (failure: { profileId?: string; reason?: AuthProfileFailureReason | null; @@ -746,6 +754,14 @@ export async function runEmbeddedPiAgent( } return failoverReason; }; + const maybeBackoffBeforeOverloadFailover = async (reason: FailoverReason | null) => { + if (reason !== "overloaded") { + return; + } + overloadFailoverAttempts += 1; + const delayMs = computeBackoff(OVERLOAD_FAILOVER_BACKOFF_POLICY, overloadFailoverAttempts); + await sleepWithAbort(delayMs, params.abortSignal); + }; try { let authRetryPending = false; // Hoisted so the retry-limit error path can use the most recent API total. 
@@ -1160,11 +1176,13 @@ export async function runEmbeddedPiAgent( profileId: lastProfileId, reason: promptProfileFailureReason, }); + const promptFailoverFailure = isFailoverErrorMessage(errorText); if ( - isFailoverErrorMessage(errorText) && + promptFailoverFailure && promptFailoverReason !== "timeout" && (await advanceAuthProfile()) ) { + await maybeBackoffBeforeOverloadFailover(promptFailoverReason); continue; } const fallbackThinking = pickFallbackThinkingLevel({ @@ -1180,7 +1198,8 @@ export async function runEmbeddedPiAgent( } // FIX: Throw FailoverError for prompt errors when fallbacks configured // This enables model fallback for quota/rate limit errors during prompt submission - if (fallbackConfigured && isFailoverErrorMessage(errorText)) { + if (fallbackConfigured && promptFailoverFailure) { + await maybeBackoffBeforeOverloadFailover(promptFailoverReason); throw new FailoverError(errorText, { reason: promptFailoverReason ?? "unknown", provider, @@ -1270,10 +1289,12 @@ export async function runEmbeddedPiAgent( const rotated = await advanceAuthProfile(); if (rotated) { + await maybeBackoffBeforeOverloadFailover(assistantFailoverReason); continue; } if (fallbackConfigured) { + await maybeBackoffBeforeOverloadFailover(assistantFailoverReason); // Prefer formatted error message (user-friendly) over raw errorMessage const message = (lastAssistant