mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 07:11:25 +00:00
fix(auth): auto-expire stale auth profile cooldowns and reset error count
When an auth profile hits a rate limit, `errorCount` is incremented and `cooldownUntil` is set with exponential backoff. After the cooldown expires, the time-based check correctly returns false — but `errorCount` persists. The next transient failure immediately escalates to a much longer cooldown because the backoff formula uses the stale count:

    60s × 5^(errorCount-1), max 1h

This creates a positive feedback loop where profiles appear permanently stuck after rate limits, requiring manual JSON editing to recover.

Add `clearExpiredCooldowns()`, which sweeps all profiles on every call to `resolveAuthProfileOrder()` and clears expired `cooldownUntil` / `disabledUntil` values along with resetting `errorCount` and `failureCounts` — giving the profile a fair retry window (circuit-breaker half-open → closed transition).

Key design decisions:
- `cooldownUntil` and `disabledUntil` are handled independently (a profile can have both; only the expired one is cleared)
- `errorCount` is reset only when ALL unusable windows have expired
- `lastFailureAt` is preserved for the existing failureWindowMs decay logic
- In-memory mutation; disk persistence happens lazily on the next store write, matching the existing save pattern

Fixes #3604
Related: #13623, #15851, #11972, #8434
This commit is contained in:
@@ -2,7 +2,7 @@ import type { OpenClawConfig } from "../../config/config.js";
|
||||
import type { AuthProfileStore } from "./types.js";
|
||||
import { normalizeProviderId } from "../model-selection.js";
|
||||
import { listProfilesForProvider } from "./profiles.js";
|
||||
import { isProfileInCooldown } from "./usage.js";
|
||||
import { clearExpiredCooldowns, isProfileInCooldown } from "./usage.js";
|
||||
|
||||
function resolveProfileUnusableUntil(stats: {
|
||||
cooldownUntil?: number;
|
||||
@@ -26,6 +26,11 @@ export function resolveAuthProfileOrder(params: {
|
||||
const { cfg, store, provider, preferredProfile } = params;
|
||||
const providerKey = normalizeProviderId(provider);
|
||||
const now = Date.now();
|
||||
|
||||
// Clear any cooldowns that have expired since the last check so profiles
|
||||
// get a fresh error count and are not immediately re-penalized on the
|
||||
// next transient failure. See #3604.
|
||||
clearExpiredCooldowns(store, now);
|
||||
const storedOrder = (() => {
|
||||
const order = store.order;
|
||||
if (!order) {
|
||||
|
||||
269
src/agents/auth-profiles/usage.test.ts
Normal file
269
src/agents/auth-profiles/usage.test.ts
Normal file
@@ -0,0 +1,269 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { AuthProfileStore } from "./types.js";
|
||||
import { clearExpiredCooldowns, isProfileInCooldown } from "./usage.js";
|
||||
|
||||
/**
 * Build a minimal `AuthProfileStore` fixture for the tests in this file.
 *
 * The returned store always carries the same two API-key profiles
 * (`anthropic:default` and `openai:default`); only `usageStats` varies
 * between test cases.
 *
 * @param usageStats Per-profile usage stats to embed, or `undefined` for none.
 * @returns A fresh store object containing the given stats.
 */
function makeStore(usageStats: AuthProfileStore["usageStats"]): AuthProfileStore {
  const fixture: AuthProfileStore = {
    version: 1,
    profiles: {
      "anthropic:default": { type: "api_key", provider: "anthropic", key: "sk-test" },
      "openai:default": { type: "api_key", provider: "openai", key: "sk-test-2" },
    },
    usageStats,
  };
  return fixture;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// isProfileInCooldown
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe("isProfileInCooldown", () => {
  // Every case probes the same profile id; only its usage stats differ.
  const profileId = "anthropic:default";

  it("returns false when profile has no usage stats", () => {
    const inCooldown = isProfileInCooldown(makeStore(undefined), profileId);

    expect(inCooldown).toBe(false);
  });

  it("returns true when cooldownUntil is in the future", () => {
    const oneMinuteAhead = Date.now() + 60_000;
    const store = makeStore({ [profileId]: { cooldownUntil: oneMinuteAhead } });

    expect(isProfileInCooldown(store, profileId)).toBe(true);
  });

  it("returns false when cooldownUntil has passed", () => {
    const oneSecondAgo = Date.now() - 1_000;
    const store = makeStore({ [profileId]: { cooldownUntil: oneSecondAgo } });

    expect(isProfileInCooldown(store, profileId)).toBe(false);
  });

  it("returns true when disabledUntil is in the future (even if cooldownUntil expired)", () => {
    // An expired cooldown must not mask a disabled window that is still open.
    const expiredCooldown = Date.now() - 1_000;
    const activeDisable = Date.now() + 60_000;
    const store = makeStore({
      [profileId]: { cooldownUntil: expiredCooldown, disabledUntil: activeDisable },
    });

    expect(isProfileInCooldown(store, profileId)).toBe(true);
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// clearExpiredCooldowns
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe("clearExpiredCooldowns", () => {
  // --- no-op cases: nothing to clear, return value must be false ----------

  it("returns false on empty usageStats", () => {
    // Covers both a missing usageStats map and a present-but-empty one.
    expect(clearExpiredCooldowns(makeStore(undefined))).toBe(false);
    expect(clearExpiredCooldowns(makeStore({}))).toBe(false);
  });

  it("returns false when no profiles have cooldowns", () => {
    const store = makeStore({
      "anthropic:default": { lastUsed: Date.now() },
    });
    expect(clearExpiredCooldowns(store)).toBe(false);
  });

  it("returns false when cooldown is still active", () => {
    const future = Date.now() + 300_000;
    const store = makeStore({
      "anthropic:default": { cooldownUntil: future, errorCount: 3 },
    });

    expect(clearExpiredCooldowns(store)).toBe(false);
    // An active window must leave both the deadline and the counter alone.
    expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBe(future);
    expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(3);
  });

  // --- single expired window ----------------------------------------------

  it("clears expired cooldownUntil and resets errorCount", () => {
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: Date.now() - 1_000,
        errorCount: 4,
        failureCounts: { rate_limit: 3, timeout: 1 },
        lastFailureAt: Date.now() - 120_000,
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(true);

    const stats = store.usageStats?.["anthropic:default"];
    expect(stats?.cooldownUntil).toBeUndefined();
    expect(stats?.errorCount).toBe(0);
    expect(stats?.failureCounts).toBeUndefined();
    // lastFailureAt preserved for failureWindowMs decay
    expect(stats?.lastFailureAt).toBeDefined();
  });

  it("clears expired disabledUntil and disabledReason", () => {
    const store = makeStore({
      "anthropic:default": {
        disabledUntil: Date.now() - 1_000,
        disabledReason: "billing",
        errorCount: 2,
        failureCounts: { billing: 2 },
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(true);

    const stats = store.usageStats?.["anthropic:default"];
    expect(stats?.disabledUntil).toBeUndefined();
    expect(stats?.disabledReason).toBeUndefined();
    expect(stats?.errorCount).toBe(0);
    expect(stats?.failureCounts).toBeUndefined();
  });

  // --- both windows present: each one expires independently ---------------

  it("handles independent expiry: cooldown expired but disabled still active", () => {
    const future = Date.now() + 3_600_000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: Date.now() - 1_000,
        disabledUntil: future,
        disabledReason: "billing",
        errorCount: 5,
        failureCounts: { rate_limit: 3, billing: 2 },
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(true);

    const stats = store.usageStats?.["anthropic:default"];
    // cooldownUntil cleared
    expect(stats?.cooldownUntil).toBeUndefined();
    // disabledUntil still active — not touched
    expect(stats?.disabledUntil).toBe(future);
    expect(stats?.disabledReason).toBe("billing");
    // errorCount NOT reset because profile still has an active unusable window
    expect(stats?.errorCount).toBe(5);
    expect(stats?.failureCounts).toEqual({ rate_limit: 3, billing: 2 });
  });

  it("handles independent expiry: disabled expired but cooldown still active", () => {
    const future = Date.now() + 300_000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: future,
        disabledUntil: Date.now() - 1_000,
        disabledReason: "billing",
        errorCount: 3,
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(true);

    const stats = store.usageStats?.["anthropic:default"];
    expect(stats?.cooldownUntil).toBe(future);
    expect(stats?.disabledUntil).toBeUndefined();
    expect(stats?.disabledReason).toBeUndefined();
    // errorCount NOT reset because cooldown is still active
    expect(stats?.errorCount).toBe(3);
  });

  it("resets errorCount only when both cooldown and disabled have expired", () => {
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: Date.now() - 2_000,
        disabledUntil: Date.now() - 1_000,
        disabledReason: "billing",
        errorCount: 4,
        failureCounts: { rate_limit: 2, billing: 2 },
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(true);

    const stats = store.usageStats?.["anthropic:default"];
    expect(stats?.cooldownUntil).toBeUndefined();
    expect(stats?.disabledUntil).toBeUndefined();
    expect(stats?.disabledReason).toBeUndefined();
    expect(stats?.errorCount).toBe(0);
    expect(stats?.failureCounts).toBeUndefined();
  });

  // --- multiple profiles ---------------------------------------------------

  it("processes multiple profiles independently", () => {
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: Date.now() - 1_000,
        errorCount: 3,
      },
      "openai:default": {
        cooldownUntil: Date.now() + 300_000,
        errorCount: 2,
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(true);

    // Anthropic: expired → cleared
    expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined();
    expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0);

    // OpenAI: still active → untouched
    expect(store.usageStats?.["openai:default"]?.cooldownUntil).toBeGreaterThan(Date.now());
    expect(store.usageStats?.["openai:default"]?.errorCount).toBe(2);
  });

  // --- explicit `now` and boundary behaviour -------------------------------

  it("accepts an explicit `now` timestamp for deterministic testing", () => {
    const fixedNow = 1_700_000_000_000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: fixedNow - 1,
        errorCount: 2,
      },
    });

    expect(clearExpiredCooldowns(store, fixedNow)).toBe(true);
    expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined();
    expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0);
  });

  // Title corrected: the assertions below expect the cooldown to be CLEARED
  // at the boundary, so the previous "does not clear" wording was misleading.
  it("clears cooldownUntil that equals exactly `now`", () => {
    const fixedNow = 1_700_000_000_000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: fixedNow,
        errorCount: 2,
      },
    });

    // ts >= cooldownUntil → should clear (cooldown "until" means the instant
    // at cooldownUntil the profile becomes available again).
    expect(clearExpiredCooldowns(store, fixedNow)).toBe(true);
    expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBeUndefined();
    expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(0);
  });

  // --- malformed deadline values are left alone ----------------------------

  it("ignores NaN and Infinity cooldown values", () => {
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: NaN,
        errorCount: 2,
      },
      "openai:default": {
        cooldownUntil: Infinity,
        errorCount: 3,
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(false);
    expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(2);
    expect(store.usageStats?.["openai:default"]?.errorCount).toBe(3);
  });

  it("ignores zero and negative cooldown values", () => {
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: 0,
        errorCount: 1,
      },
      "openai:default": {
        cooldownUntil: -1,
        errorCount: 1,
      },
    });

    expect(clearExpiredCooldowns(store)).toBe(false);
    // "Ignored" means untouched: neither the odd deadline nor the counter
    // may be rewritten when the value is not a positive timestamp.
    expect(store.usageStats?.["anthropic:default"]?.cooldownUntil).toBe(0);
    expect(store.usageStats?.["anthropic:default"]?.errorCount).toBe(1);
    expect(store.usageStats?.["openai:default"]?.cooldownUntil).toBe(-1);
    expect(store.usageStats?.["openai:default"]?.errorCount).toBe(1);
  });
});
|
||||
@@ -51,6 +51,77 @@ export function getSoonestCooldownExpiry(
|
||||
return soonest;
|
||||
}
|
||||
|
||||
/**
 * Sweep every profile in `store.usageStats` and drop `cooldownUntil` /
 * `disabledUntil` deadlines that have been reached.
 *
 * A deadline is treated as expired only when it is a finite, positive number
 * and the reference time is greater than or equal to it. Anything else
 * (missing, `NaN`, `Infinity`, zero, negative) is left untouched.
 *
 * The two windows are evaluated independently, so a profile carrying both may
 * have just one of them cleared. Clearing `disabledUntil` also clears
 * `disabledReason`. Once a profile was modified and
 * `resolveProfileUnusableUntil` reports nothing remaining, `errorCount` is set
 * to 0 and `failureCounts` is removed, so the next transient failure does not
 * escalate from a stale count (circuit-breaker half-open → closed).
 * `lastFailureAt` is never modified here.
 *
 * Only the in-memory store is mutated; nothing is written to disk by this
 * function. Persistence is left to the next store write (e.g.
 * `markAuthProfileUsed` / `markAuthProfileFailure`).
 *
 * @param store Auth profile store whose usage stats are swept in place.
 * @param now Reference timestamp in ms; defaults to `Date.now()`.
 * @returns `true` if at least one profile was modified, otherwise `false`.
 */
export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): boolean {
  const allStats = store.usageStats;
  if (!allStats) {
    return false;
  }

  const currentTime = now ?? Date.now();

  // Shared expiry rule for both deadline fields (inclusive boundary).
  const hasExpired = (until: unknown): boolean =>
    typeof until === "number" && Number.isFinite(until) && until > 0 && currentTime >= until;

  let anyChanged = false;

  for (const [profileId, stats] of Object.entries(allStats)) {
    if (!stats) {
      continue;
    }

    // Decide on both windows before touching either field.
    const cooldownDone = hasExpired(stats.cooldownUntil);
    const disabledDone = hasExpired(stats.disabledUntil);
    if (!cooldownDone && !disabledDone) {
      continue;
    }

    if (cooldownDone) {
      stats.cooldownUntil = undefined;
    }
    if (disabledDone) {
      stats.disabledUntil = undefined;
      stats.disabledReason = undefined;
    }

    // Counters are reset only when no unusable window is left on the profile;
    // lastFailureAt stays as-is for the failureWindowMs decay check in
    // computeNextProfileUsageStats.
    if (!resolveProfileUnusableUntil(stats)) {
      stats.errorCount = 0;
      stats.failureCounts = undefined;
    }

    allStats[profileId] = stats;
    anyChanged = true;
  }

  return anyChanged;
}
|
||||
|
||||
/**
|
||||
* Mark a profile as successfully used. Resets error count and updates lastUsed.
|
||||
* Uses store lock to avoid overwriting concurrent usage updates.
|
||||
|
||||
Reference in New Issue
Block a user