mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-30 05:54:43 +00:00
fix(auth-profiles): never shorten cooldown deadline on retry
When the backoff saturates at 60 min and retries fire every 30 min (e.g. cron jobs), each failed request was resetting cooldownUntil to now+60m. Because each retry arrived before the existing deadline had expired, the window kept getting renewed and the profile never recovered without manually clearing usageStats in auth-profiles.json. Fix: only write a new cooldownUntil (or disabledUntil for billing) when the new deadline is strictly later than the existing one. This lets the original window expire naturally while still allowing genuine backoff extension when error counts climb further. Fixes #23516 [AI-assisted]
This commit is contained in:
committed by
Peter Steinberger
parent
bec059f7b2
commit
dc69610d51
@@ -19,6 +19,7 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Auth/Profiles: prevent cooldown deadline from being reset on every retry when the backoff is already saturated. Previously each failed request overwrote `cooldownUntil` with `now + backoffMs`, so a 60-minute cooldown was perpetually extended by cron or inbound retries, trapping the gateway in an unrecoverable loop that required manual `usageStats` deletion to resolve. (#23516)
|
||||
- Channels/Security: fail closed on missing provider group policy config by defaulting runtime group policy to `allowlist` (instead of inheriting `channels.defaults.groupPolicy`) when `channels.<provider>` is absent across message channels, and align runtime + security warnings/docs to the same fallback behavior (Slack, Discord, iMessage, Telegram, WhatsApp, Signal, LINE, Matrix, Mattermost, Google Chat, IRC, Nextcloud Talk, Feishu, and Zalo user flows; plus Discord message/native-command paths). (#23367) Thanks @bmendonca3.
|
||||
- Gateway/Onboarding: harden remote gateway onboarding defaults and guidance by defaulting discovered direct URLs to `wss://`, rejecting insecure non-loopback `ws://` targets in onboarding validation, and expanding remote-security remediation messaging across gateway client/call/doctor flows. (#23476) Thanks @bmendonca3.
|
||||
- CLI/Sessions: pass the configured sessions directory when resolving transcript paths in `agentCommand`, so custom `session.store` locations resume sessions reliably. Thanks @davidrudduck.
|
||||
|
||||
@@ -4,6 +4,7 @@ import {
|
||||
clearAuthProfileCooldown,
|
||||
clearExpiredCooldowns,
|
||||
isProfileInCooldown,
|
||||
markAuthProfileFailure,
|
||||
resolveProfileUnusableUntil,
|
||||
} from "./usage.js";
|
||||
|
||||
@@ -347,3 +348,99 @@ describe("clearAuthProfileCooldown", () => {
|
||||
expect(store.usageStats).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe("markAuthProfileFailure — cooldown is never reset to an earlier deadline", () => {
  // Regression for https://github.com/openclaw/openclaw/issues/23516
  // When all providers are at saturation backoff (60 min) and retries fire every 30 min,
  // each retry was resetting cooldownUntil to now+60m, preventing recovery.
  //
  // Each test seeds a store with an existing deadline, freezes the clock at `now`,
  // records one more failure, and then checks how the stored deadline moved.

  it("does not shorten an existing cooldown when a retry fires mid-window", async () => {
    const now = 1_000_000;
    // The existing deadline is deliberately later than now + 60 min (the saturated
    // backoff described above). An unconditional `cooldownUntil = now + backoffMs`
    // write would therefore move the deadline EARLIER and fail the assertion below;
    // a fixture inside the 60-min range would pass with or without the guard.
    const existingCooldownUntil = now + 90 * 60 * 1000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: existingCooldownUntil,
        errorCount: 3, // already at saturation (60-min backoff)
        lastFailureAt: now - 10 * 60 * 1000,
      },
    });

    vi.useFakeTimers();
    vi.setSystemTime(now);
    try {
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "rate_limit",
      });
    } finally {
      // Always restore real timers so a failure here cannot leak into other tests.
      vi.useRealTimers();
    }

    const stats = store.usageStats?.["anthropic:default"];
    // cooldownUntil must NOT have been reset to now+60m (= now+3_600_000), which is
    // earlier than existingCooldownUntil (= now+5_400_000).
    // It should remain at the original deadline or be extended, never shortened.
    expect(stats?.cooldownUntil).toBeGreaterThanOrEqual(existingCooldownUntil);
  });

  it("does extend cooldownUntil when the new backoff would end later", async () => {
    const now = 1_000_000;
    // Profile has only 5 min remaining but the next backoff level gives 60 min
    const existingCooldownUntil = now + 5 * 60 * 1000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: existingCooldownUntil,
        errorCount: 2, // next step: 60-min backoff
        lastFailureAt: now - 60_000,
      },
    });

    vi.useFakeTimers();
    vi.setSystemTime(now);
    try {
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "rate_limit",
      });
    } finally {
      vi.useRealTimers();
    }

    const stats = store.usageStats?.["anthropic:default"];
    // now+60min > existingCooldownUntil (now+5min), so it should be extended
    expect(stats?.cooldownUntil).toBeGreaterThan(existingCooldownUntil);
  });

  it("does not shorten an existing disabledUntil on a billing retry", async () => {
    const now = 1_000_000;
    // Profile already has 20 hours remaining on a billing disable
    // NOTE(review): this only catches a shortening write if the billing backoff
    // computed for this profile is under 20 h — confirm against the resolved
    // billingBackoffMs / billingMaxMs values, which are not visible from this test.
    const existingDisabledUntil = now + 20 * 60 * 60 * 1000;
    const store = makeStore({
      "anthropic:default": {
        disabledUntil: existingDisabledUntil,
        disabledReason: "billing",
        errorCount: 5,
        failureCounts: { billing: 5 },
        lastFailureAt: now - 60_000,
      },
    });

    vi.useFakeTimers();
    vi.setSystemTime(now);
    try {
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "billing",
      });
    } finally {
      vi.useRealTimers();
    }

    const stats = store.usageStats?.["anthropic:default"];
    // disabledUntil must not have been shortened
    expect(stats?.disabledUntil).toBeGreaterThanOrEqual(existingDisabledUntil);
  });
});
|
||||
|
||||
@@ -287,11 +287,25 @@ function computeNextProfileUsageStats(params: {
|
||||
baseMs: params.cfgResolved.billingBackoffMs,
|
||||
maxMs: params.cfgResolved.billingMaxMs,
|
||||
});
|
||||
updatedStats.disabledUntil = params.now + backoffMs;
|
||||
const newDisabledUntil = params.now + backoffMs;
|
||||
// Only advance disabledUntil — never shorten an existing window.
|
||||
// A retry that fires while the profile is already disabled must not reset
|
||||
// the deadline to an earlier time; it may extend it if the new backoff is longer.
|
||||
if (!params.existing.disabledUntil || newDisabledUntil > params.existing.disabledUntil) {
|
||||
updatedStats.disabledUntil = newDisabledUntil;
|
||||
}
|
||||
updatedStats.disabledReason = "billing";
|
||||
} else {
|
||||
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
|
||||
updatedStats.cooldownUntil = params.now + backoffMs;
|
||||
const newCooldownUntil = params.now + backoffMs;
|
||||
// Only advance cooldownUntil — never shorten an existing window.
|
||||
// When the backoff saturates (60 min) and retries fire every 30 min, each
|
||||
// retry was resetting cooldownUntil to now+60m, preventing the profile from
|
||||
// ever recovering. We only write a new deadline when it is strictly later
|
||||
// than the one already in the store.
|
||||
if (!params.existing.cooldownUntil || newCooldownUntil > params.existing.cooldownUntil) {
|
||||
updatedStats.cooldownUntil = newCooldownUntil;
|
||||
}
|
||||
}
|
||||
|
||||
return updatedStats;
|
||||
|
||||
Reference in New Issue
Block a user