mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-19 11:38:38 +00:00
fix(agents): back off before overload failover
This commit is contained in:
@@ -118,7 +118,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
|
- Security/auth labels: remove token and API-key snippets from user-facing auth status labels so `/status` and `/models` do not expose credential fragments. (#33262) thanks @cu1ch3n.
|
||||||
- Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
|
- Auth/credential semantics: align profile eligibility + probe diagnostics with SecretRef/expiry rules and harden browser download atomic writes. (#33733) thanks @joshavant.
|
||||||
- Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
|
- Security/audit denyCommands guidance: suggest likely exact node command IDs for unknown `gateway.nodes.denyCommands` entries so ineffective denylist entries are easier to correct. (#29713) thanks @liquidhorizon88-bot.
|
||||||
- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts and keep overloaded prompt/assistant failures failover-eligible without recording auth-profile failure state, so transient provider overloads do not poison later profile selection on the same provider.
|
- Agents/overload failover handling: classify overloaded provider failures separately from rate limits/status timeouts, add short overload backoff before retry/failover, and keep overloaded prompt/assistant failures out of auth-profile failure state so transient provider overloads do not poison later profile selection on the same provider.
|
||||||
- Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
|
- Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
|
||||||
- Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
|
- Docs/security threat-model links: replace relative `.md` links with Mintlify-compatible root-relative routes in security docs to prevent broken internal navigation. (#27698) thanks @clawdoo.
|
||||||
- Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.
|
- Plugins/Update integrity drift: avoid false integrity drift prompts when updating npm-installed plugins from unpinned specs, while keeping drift checks for exact pinned versions. (#37179) Thanks @vincentkoc.
|
||||||
|
|||||||
@@ -9,11 +9,28 @@ import type { EmbeddedRunAttemptResult } from "./pi-embedded-runner/run/types.js
|
|||||||
|
|
||||||
const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>();
|
const runEmbeddedAttemptMock = vi.fn<(params: unknown) => Promise<EmbeddedRunAttemptResult>>();
|
||||||
const resolveCopilotApiTokenMock = vi.fn();
|
const resolveCopilotApiTokenMock = vi.fn();
|
||||||
|
const { computeBackoffMock, sleepWithAbortMock } = vi.hoisted(() => ({ // Builds the two backoff mocks via vi.hoisted; the vi.mock("../infra/backoff.js") factory forwards to them.
|
||||||
|
computeBackoffMock: vi.fn(
|
||||||
|
(
|
||||||
|
_policy: { initialMs: number; maxMs: number; factor: number; jitter: number },
|
||||||
|
_attempt: number,
|
||||||
|
) => 321, // Ignores both arguments and returns a fixed, recognizable delay that the tests assert on.
|
||||||
|
),
|
||||||
|
sleepWithAbortMock: vi.fn(async (_ms: number, _abortSignal?: AbortSignal) => undefined), // Resolves immediately with undefined, so tests never actually wait.
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("./pi-embedded-runner/run/attempt.js", () => ({
|
vi.mock("./pi-embedded-runner/run/attempt.js", () => ({
|
||||||
runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params),
|
runEmbeddedAttempt: (params: unknown) => runEmbeddedAttemptMock(params),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
vi.mock("../infra/backoff.js", () => ({ // Replaces the backoff module with thin wrappers that delegate to the hoisted mocks.
|
||||||
|
computeBackoff: (
|
||||||
|
policy: { initialMs: number; maxMs: number; factor: number; jitter: number },
|
||||||
|
attempt: number,
|
||||||
|
) => computeBackoffMock(policy, attempt), // Forwards both arguments unchanged so call counts/arguments can be asserted.
|
||||||
|
sleepWithAbort: (ms: number, abortSignal?: AbortSignal) => sleepWithAbortMock(ms, abortSignal), // Forwards the delay and the (possibly undefined) abort signal as-is.
|
||||||
|
}));
|
||||||
|
|
||||||
vi.mock("../providers/github-copilot-token.js", () => ({
|
vi.mock("../providers/github-copilot-token.js", () => ({
|
||||||
DEFAULT_COPILOT_API_BASE_URL: "https://api.individual.githubcopilot.com",
|
DEFAULT_COPILOT_API_BASE_URL: "https://api.individual.githubcopilot.com",
|
||||||
resolveCopilotApiToken: (...args: unknown[]) => resolveCopilotApiTokenMock(...args),
|
resolveCopilotApiToken: (...args: unknown[]) => resolveCopilotApiTokenMock(...args),
|
||||||
@@ -43,6 +60,8 @@ beforeEach(() => {
|
|||||||
vi.useRealTimers();
|
vi.useRealTimers();
|
||||||
runEmbeddedAttemptMock.mockClear();
|
runEmbeddedAttemptMock.mockClear();
|
||||||
resolveCopilotApiTokenMock.mockReset();
|
resolveCopilotApiTokenMock.mockReset();
|
||||||
|
computeBackoffMock.mockClear();
|
||||||
|
sleepWithAbortMock.mockClear();
|
||||||
});
|
});
|
||||||
|
|
||||||
const baseUsage = {
|
const baseUsage = {
|
||||||
@@ -687,6 +706,9 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
|
|||||||
});
|
});
|
||||||
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
|
it("rotates for overloaded prompt failures across auto-pinned profiles", async () => {
|
||||||
@@ -697,6 +719,9 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
|
|||||||
});
|
});
|
||||||
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
|
expect(computeBackoffMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledTimes(1);
|
||||||
|
expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("rotates on timeout without cooling down the timed-out profile", async () => {
|
it("rotates on timeout without cooling down the timed-out profile", async () => {
|
||||||
@@ -707,6 +732,8 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
|
|||||||
});
|
});
|
||||||
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
|
||||||
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
|
||||||
|
expect(computeBackoffMock).not.toHaveBeenCalled();
|
||||||
|
expect(sleepWithAbortMock).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("rotates on bare service unavailable without cooling down the profile", async () => {
|
it("rotates on bare service unavailable without cooling down the profile", async () => {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import { randomBytes } from "node:crypto";
|
import { randomBytes } from "node:crypto";
|
||||||
import fs from "node:fs/promises";
|
import fs from "node:fs/promises";
|
||||||
import type { ThinkLevel } from "../../auto-reply/thinking.js";
|
import type { ThinkLevel } from "../../auto-reply/thinking.js";
|
||||||
|
import { computeBackoff, sleepWithAbort, type BackoffPolicy } from "../../infra/backoff.js";
|
||||||
import { generateSecureToken } from "../../infra/secure-random.js";
|
import { generateSecureToken } from "../../infra/secure-random.js";
|
||||||
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
|
import { getGlobalHookRunner } from "../../plugins/hook-runner-global.js";
|
||||||
import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js";
|
import type { PluginHookBeforeAgentStartResult } from "../../plugins/types.js";
|
||||||
@@ -77,6 +78,12 @@ type CopilotTokenState = {
|
|||||||
const COPILOT_REFRESH_MARGIN_MS = 5 * 60 * 1000;
|
const COPILOT_REFRESH_MARGIN_MS = 5 * 60 * 1000;
|
||||||
const COPILOT_REFRESH_RETRY_MS = 60 * 1000;
|
const COPILOT_REFRESH_RETRY_MS = 60 * 1000;
|
||||||
const COPILOT_REFRESH_MIN_DELAY_MS = 5 * 1000;
|
const COPILOT_REFRESH_MIN_DELAY_MS = 5 * 1000;
|
||||||
|
const OVERLOAD_FAILOVER_BACKOFF_POLICY: BackoffPolicy = { // Policy handed to computeBackoff before retrying/failing over after an "overloaded" failure.
|
||||||
|
initialMs: 250, // NOTE(review): presumably the first-attempt delay in ms — confirm in infra/backoff.
|
||||||
|
maxMs: 1_500, // NOTE(review): presumably the upper bound for any single delay in ms — confirm in infra/backoff.
|
||||||
|
factor: 2, // NOTE(review): presumably the per-attempt growth multiplier — confirm in infra/backoff.
|
||||||
|
jitter: 0.2, // NOTE(review): presumably a fractional randomization of the delay — confirm in infra/backoff.
|
||||||
|
};
|
||||||
|
|
||||||
// Avoid Anthropic's refusal test token poisoning session transcripts.
|
// Avoid Anthropic's refusal test token poisoning session transcripts.
|
||||||
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
|
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
|
||||||
@@ -720,6 +727,7 @@ export async function runEmbeddedPiAgent(
|
|||||||
let lastRunPromptUsage: ReturnType<typeof normalizeUsage> | undefined;
|
let lastRunPromptUsage: ReturnType<typeof normalizeUsage> | undefined;
|
||||||
let autoCompactionCount = 0;
|
let autoCompactionCount = 0;
|
||||||
let runLoopIterations = 0;
|
let runLoopIterations = 0;
|
||||||
|
let overloadFailoverAttempts = 0;
|
||||||
const maybeMarkAuthProfileFailure = async (failure: {
|
const maybeMarkAuthProfileFailure = async (failure: {
|
||||||
profileId?: string;
|
profileId?: string;
|
||||||
reason?: AuthProfileFailureReason | null;
|
reason?: AuthProfileFailureReason | null;
|
||||||
@@ -746,6 +754,14 @@ export async function runEmbeddedPiAgent(
|
|||||||
}
|
}
|
||||||
return failoverReason;
|
return failoverReason;
|
||||||
};
|
};
|
||||||
|
const maybeBackoffBeforeOverloadFailover = async (reason: FailoverReason | null) => { // Awaits a short backoff before a retry/failover, but only when the failure reason is "overloaded".
|
||||||
|
if (reason !== "overloaded") { // Every other reason (including null) skips the delay entirely.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
overloadFailoverAttempts += 1; // Counter is declared once per runEmbeddedPiAgent call and is not reset here, so each overload passes a higher attempt number.
|
||||||
|
const delayMs = computeBackoff(OVERLOAD_FAILOVER_BACKOFF_POLICY, overloadFailoverAttempts);
|
||||||
|
await sleepWithAbort(delayMs, params.abortSignal); // Passes the caller's abort signal through; how an abort surfaces is up to sleepWithAbort.
|
||||||
|
};
|
||||||
try {
|
try {
|
||||||
let authRetryPending = false;
|
let authRetryPending = false;
|
||||||
// Hoisted so the retry-limit error path can use the most recent API total.
|
// Hoisted so the retry-limit error path can use the most recent API total.
|
||||||
@@ -1160,11 +1176,13 @@ export async function runEmbeddedPiAgent(
|
|||||||
profileId: lastProfileId,
|
profileId: lastProfileId,
|
||||||
reason: promptProfileFailureReason,
|
reason: promptProfileFailureReason,
|
||||||
});
|
});
|
||||||
|
const promptFailoverFailure = isFailoverErrorMessage(errorText);
|
||||||
if (
|
if (
|
||||||
isFailoverErrorMessage(errorText) &&
|
promptFailoverFailure &&
|
||||||
promptFailoverReason !== "timeout" &&
|
promptFailoverReason !== "timeout" &&
|
||||||
(await advanceAuthProfile())
|
(await advanceAuthProfile())
|
||||||
) {
|
) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const fallbackThinking = pickFallbackThinkingLevel({
|
const fallbackThinking = pickFallbackThinkingLevel({
|
||||||
@@ -1180,7 +1198,8 @@ export async function runEmbeddedPiAgent(
|
|||||||
}
|
}
|
||||||
// FIX: Throw FailoverError for prompt errors when fallbacks are configured
|
// FIX: Throw FailoverError for prompt errors when fallbacks are configured
|
||||||
// This enables model fallback for quota/rate limit errors during prompt submission
|
// This enables model fallback for quota/rate limit errors during prompt submission
|
||||||
if (fallbackConfigured && isFailoverErrorMessage(errorText)) {
|
if (fallbackConfigured && promptFailoverFailure) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
|
||||||
throw new FailoverError(errorText, {
|
throw new FailoverError(errorText, {
|
||||||
reason: promptFailoverReason ?? "unknown",
|
reason: promptFailoverReason ?? "unknown",
|
||||||
provider,
|
provider,
|
||||||
@@ -1270,10 +1289,12 @@ export async function runEmbeddedPiAgent(
|
|||||||
|
|
||||||
const rotated = await advanceAuthProfile();
|
const rotated = await advanceAuthProfile();
|
||||||
if (rotated) {
|
if (rotated) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(assistantFailoverReason);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fallbackConfigured) {
|
if (fallbackConfigured) {
|
||||||
|
await maybeBackoffBeforeOverloadFailover(assistantFailoverReason);
|
||||||
// Prefer formatted error message (user-friendly) over raw errorMessage
|
// Prefer formatted error message (user-friendly) over raw errorMessage
|
||||||
const message =
|
const message =
|
||||||
(lastAssistant
|
(lastAssistant
|
||||||
|
|||||||
Reference in New Issue
Block a user