fix(auth-profiles): never shorten cooldown deadline on retry

When the backoff saturates at 60 min and retries fire every 30 min
(e.g. cron jobs), each failed request was resetting cooldownUntil to
now+60m.  Because that write was unconditional and ignored the deadline
already in the store, the window kept getting renewed and the profile
never recovered without manually clearing usageStats in
auth-profiles.json.

Fix: only write a new cooldownUntil (or disabledUntil for billing) when
the new deadline is strictly later than the existing one.  This lets the
original window expire naturally while still allowing genuine backoff
extension when error counts climb further.

Fixes #23516

[AI-assisted]
This commit is contained in:
artale
2026-02-22 12:53:44 +01:00
committed by Peter Steinberger
parent bec059f7b2
commit dc69610d51
3 changed files with 114 additions and 2 deletions

View File

@@ -4,6 +4,7 @@ import {
clearAuthProfileCooldown,
clearExpiredCooldowns,
isProfileInCooldown,
markAuthProfileFailure,
resolveProfileUnusableUntil,
} from "./usage.js";
@@ -347,3 +348,99 @@ describe("clearAuthProfileCooldown", () => {
expect(store.usageStats).toBeUndefined();
});
});
describe("markAuthProfileFailure — cooldown is never reset to an earlier deadline", () => {
  // Regression for https://github.com/openclaw/openclaw/issues/23516
  // A failure recorded while a profile is already cooling down (or disabled for
  // billing) must never move the stored deadline to an earlier point in time.
  // Each case pins the clock with fake timers so `now` is deterministic, and
  // always restores real timers in `finally` so a failing call cannot leak
  // fake timers into other tests.
  it("does not shorten an existing cooldown when a retry fires mid-window", async () => {
    const now = 1_000_000;
    // The stored deadline (now + 90 min) lies beyond now + 60 min, the saturated
    // backoff. An unconditional `cooldownUntil = now + backoff` write would
    // therefore pull the deadline earlier and fail the assertion below. With a
    // stored deadline inside the 60-min window the assertion could not tell the
    // guarded write from the unguarded one.
    const existingCooldownUntil = now + 90 * 60 * 1000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: existingCooldownUntil,
        errorCount: 3, // already at saturation (60-min backoff)
        lastFailureAt: now - 10 * 60 * 1000,
      },
    });
    vi.useFakeTimers();
    vi.setSystemTime(now);
    try {
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "rate_limit",
      });
    } finally {
      vi.useRealTimers();
    }
    const stats = store.usageStats?.["anthropic:default"];
    // A reset to now+60m (= now + 3_600_000) would be earlier than
    // existingCooldownUntil (= now + 5_400_000). The deadline may stay put or
    // move later, but it must never move earlier.
    expect(stats?.cooldownUntil).toBeGreaterThanOrEqual(existingCooldownUntil);
  });
  it("does extend cooldownUntil when the new backoff would end later", async () => {
    const now = 1_000_000;
    // Only 5 min remain on the stored cooldown; the backoff computed for the
    // next failure is expected to end later than that.
    const existingCooldownUntil = now + 5 * 60 * 1000;
    const store = makeStore({
      "anthropic:default": {
        cooldownUntil: existingCooldownUntil,
        errorCount: 2, // next failure should back off for longer than the 5 min remaining
        lastFailureAt: now - 60_000,
      },
    });
    vi.useFakeTimers();
    vi.setSystemTime(now);
    try {
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "rate_limit",
      });
    } finally {
      vi.useRealTimers();
    }
    const stats = store.usageStats?.["anthropic:default"];
    // The recomputed deadline ends after existingCooldownUntil (now + 5 min),
    // so the stored value must have been advanced.
    expect(stats?.cooldownUntil).toBeGreaterThan(existingCooldownUntil);
  });
  it("does not shorten an existing disabledUntil on a billing retry", async () => {
    const now = 1_000_000;
    // Profile already has 20 hours remaining on a billing disable
    const existingDisabledUntil = now + 20 * 60 * 60 * 1000;
    const store = makeStore({
      "anthropic:default": {
        disabledUntil: existingDisabledUntil,
        disabledReason: "billing",
        errorCount: 5,
        failureCounts: { billing: 5 },
        lastFailureAt: now - 60_000,
      },
    });
    vi.useFakeTimers();
    vi.setSystemTime(now);
    try {
      await markAuthProfileFailure({
        store,
        profileId: "anthropic:default",
        reason: "billing",
      });
    } finally {
      vi.useRealTimers();
    }
    const stats = store.usageStats?.["anthropic:default"];
    // disabledUntil may stay the same or move later, but never earlier.
    // NOTE(review): whether this exercises the "keep" or the "extend" path
    // depends on the configured billing backoff, which is not visible here.
    expect(stats?.disabledUntil).toBeGreaterThanOrEqual(existingDisabledUntil);
  });
});

View File

@@ -287,11 +287,25 @@ function computeNextProfileUsageStats(params: {
baseMs: params.cfgResolved.billingBackoffMs,
maxMs: params.cfgResolved.billingMaxMs,
});
updatedStats.disabledUntil = params.now + backoffMs;
const newDisabledUntil = params.now + backoffMs;
// Only advance disabledUntil — never shorten an existing window.
// A retry that fires while the profile is already disabled must not reset
// the deadline to an earlier time; it may extend it if the new backoff is longer.
if (!params.existing.disabledUntil || newDisabledUntil > params.existing.disabledUntil) {
updatedStats.disabledUntil = newDisabledUntil;
}
updatedStats.disabledReason = "billing";
} else {
const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
updatedStats.cooldownUntil = params.now + backoffMs;
const newCooldownUntil = params.now + backoffMs;
// Only advance cooldownUntil — never shorten an existing window.
// When the backoff saturates (60 min) and retries fire every 30 min, each
// retry was resetting cooldownUntil to now+60m, preventing the profile from
// ever recovering. We only write a new deadline when it is strictly later
// than the one already in the store.
if (!params.existing.cooldownUntil || newCooldownUntil > params.existing.cooldownUntil) {
updatedStats.cooldownUntil = newCooldownUntil;
}
}
return updatedStats;