From 9b7fce0249fc752fd852801c7522a0926838ea63 Mon Sep 17 00:00:00 2001 From: zerone0x Date: Mon, 9 Mar 2026 21:11:24 +0800 Subject: [PATCH] fix(auth): reset error counters when cooldown expires to prevent infinite escalation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When multiple models from the same provider share a single auth profile (e.g. zai:default), rate limit errors from model fallback probes accumulate the error count within the active cooldown window. After the cooldown expires, clearExpiredCooldowns() resets the count in-memory, but the on-disk state may still carry the stale count when the lock-based updater reads a fresh store. The next failure then computes backoff from the inflated count (e.g. a stale errorCount=3 makes the next failure count as the 4th → 60 min cooldown instead of 1 min), creating an ever-escalating cooldown loop. Fix: in computeNextProfileUsageStats, check whether the previous cooldown/disabled window has expired before computing the next backoff. If it has, reset error counters (mirroring what clearExpiredCooldowns does) so the profile gets a fresh 1-minute window. 
Fixes #40989 Co-Authored-By: Claude Opus 4.6 --- ...th-profiles.markauthprofilefailure.test.ts | 52 +++++++++++++++++++ src/agents/auth-profiles/usage.test.ts | 15 ++++-- src/agents/auth-profiles/usage.ts | 21 +++++++- 3 files changed, 83 insertions(+), 5 deletions(-) diff --git a/src/agents/auth-profiles.markauthprofilefailure.test.ts b/src/agents/auth-profiles.markauthprofilefailure.test.ts index e5690f75c6a..5c4d73197b3 100644 --- a/src/agents/auth-profiles.markauthprofilefailure.test.ts +++ b/src/agents/auth-profiles.markauthprofilefailure.test.ts @@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => { } }); + it("resets error count when previous cooldown has expired to prevent escalation", async () => { + const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-")); + try { + const authPath = path.join(agentDir, "auth-profiles.json"); + const now = Date.now(); + // Simulate state left on disk after 3 rapid failures within a 1-min cooldown + // window. The cooldown has since expired, but clearExpiredCooldowns() only + // ran in-memory and never persisted — so disk still carries errorCount: 3. + fs.writeFileSync( + authPath, + JSON.stringify({ + version: 1, + profiles: { + "anthropic:default": { + type: "api_key", + provider: "anthropic", + key: "sk-default", + }, + }, + usageStats: { + "anthropic:default": { + errorCount: 3, + failureCounts: { rate_limit: 3 }, + lastFailureAt: now - 120_000, // 2 minutes ago + cooldownUntil: now - 60_000, // expired 1 minute ago + }, + }, + }), + ); + + const store = ensureAuthProfileStore(agentDir); + await markAuthProfileFailure({ + store, + profileId: "anthropic:default", + reason: "rate_limit", + agentDir, + }); + + const stats = store.usageStats?.["anthropic:default"]; + // Error count should reset to 1 (not escalate to 4) because the + // previous cooldown expired. Cooldown should be ~1 min, not ~60 min. 
+ expect(stats?.errorCount).toBe(1); + expect(stats?.failureCounts?.rate_limit).toBe(1); + const cooldownMs = (stats?.cooldownUntil ?? 0) - now; + // calculateAuthProfileCooldownMs(1) = 60_000 (1 minute) + expect(cooldownMs).toBeLessThan(120_000); + expect(cooldownMs).toBeGreaterThan(0); + } finally { + fs.rmSync(agentDir, { recursive: true, force: true }); + } + }); + it("does not persist cooldown windows for OpenRouter profiles", async () => { await withAuthProfileStore(async ({ agentDir, store }) => { await markAuthProfileFailure({ diff --git a/src/agents/auth-profiles/usage.test.ts b/src/agents/auth-profiles/usage.test.ts index 120f75d3665..261eae6efd5 100644 --- a/src/agents/auth-profiles/usage.test.ts +++ b/src/agents/auth-profiles/usage.test.ts @@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () }); } + // When a cooldown/disabled window expires, the error count resets to prevent + // stale counters from escalating the next cooldown (the root cause of + // infinite cooldown loops — see #40989). The next failure should compute + // backoff from errorCount=1, not from the accumulated stale count. 
const expiredWindowCases = [ { label: "cooldownUntil", @@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () errorCount: 3, lastFailureAt: now - 60_000, }), - expectedUntil: (now: number) => now + 60 * 60 * 1000, + // errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000 + expectedUntil: (now: number) => now + 60_000, readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil, }, { @@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () failureCounts: { billing: 2 }, lastFailureAt: now - 60_000, }), - expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000, + // errorCount resets, billing count resets to 1 → + // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h + expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, { @@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", () failureCounts: { auth_permanent: 2 }, lastFailureAt: now - 60_000, }), - expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000, + // errorCount resets, auth_permanent count resets to 1 → + // calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h + expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000, readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil, }, ]; diff --git a/src/agents/auth-profiles/usage.ts b/src/agents/auth-profiles/usage.ts index c28b51e3e57..19f3a030fad 100644 --- a/src/agents/auth-profiles/usage.ts +++ b/src/agents/auth-profiles/usage.ts @@ -400,9 +400,26 @@ function computeNextProfileUsageStats(params: { params.existing.lastFailureAt > 0 && params.now - params.existing.lastFailureAt > windowMs; - const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0); + // If the previous cooldown has already expired, reset error counters so the + // profile gets a fresh backoff window. 
clearExpiredCooldowns() does this + // in-memory during profile ordering, but the on-disk state may still carry + // the old counters when the lock-based updater reads a fresh store. Without + // this check, stale error counts from an expired cooldown cause the next + // failure to escalate to a much longer cooldown (e.g. 1 min → 25 min). + const previousCooldownExpired = (() => { + const unusableUntil = resolveProfileUnusableUntil(params.existing); + // No cooldown/disabled window was ever set → fresh profile, nothing to expire. + if (unusableUntil === null) { + return false; + } + // The window exists and has expired. + return params.now >= unusableUntil; + })(); + + const shouldResetCounters = windowExpired || previousCooldownExpired; + const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0); const nextErrorCount = baseErrorCount + 1; - const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts }; + const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts }; failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1; const updatedStats: ProfileUsageStats = {