Files
openclaw/src/agents/auth-profiles/usage.ts
artale dc69610d51 fix(auth-profiles): never shorten cooldown deadline on retry
When the backoff saturates at 60 min and retries fire every 30 min
(e.g. cron jobs), each failed request was resetting cooldownUntil to
now+60m.  Because now+60m > existing deadline, the window kept getting
renewed and the profile never recovered without manually clearing
usageStats in auth-profiles.json.

Fix: only write a new cooldownUntil (or disabledUntil for billing) when
the new deadline is strictly later than the existing one.  This lets the
original window expire naturally while still allowing genuine backoff
extension when error counts climb further.

Fixes #23516

[AI-assisted]
2026-02-22 13:14:02 +01:00

442 lines
13 KiB
TypeScript

import type { OpenClawConfig } from "../../config/config.js";
import { normalizeProviderId } from "../model-selection.js";
import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";
/**
 * Resolve the timestamp until which a profile cannot be used.
 *
 * Looks at both `cooldownUntil` and `disabledUntil`, skips anything that is
 * not a finite, strictly positive number, and keeps the later of the two.
 *
 * @param stats - The cooldown/disabled deadlines recorded for a profile.
 * @returns The latest valid deadline, or `null` when neither field holds one.
 */
export function resolveProfileUnusableUntil(
  stats: Pick<ProfileUsageStats, "cooldownUntil" | "disabledUntil">,
): number | null {
  let latest: number | null = null;
  for (const candidate of [stats.cooldownUntil, stats.disabledUntil]) {
    if (typeof candidate !== "number" || !Number.isFinite(candidate) || candidate <= 0) {
      continue;
    }
    if (latest === null || candidate > latest) {
      latest = candidate;
    }
  }
  return latest;
}
/**
 * Report whether a profile is unusable right now, i.e. its cooldown or
 * disabled deadline (whichever is later) still lies in the future.
 *
 * @returns `false` when the profile has no usage stats or no valid deadline.
 */
export function isProfileInCooldown(store: AuthProfileStore, profileId: string): boolean {
  const entry = store.usageStats?.[profileId];
  if (entry) {
    const deadline = resolveProfileUnusableUntil(entry);
    if (deadline) {
      return Date.now() < deadline;
    }
  }
  return false;
}
/**
 * Find the earliest `unusableUntil` timestamp (ms epoch) across the given
 * profiles.
 *
 * Profiles without usage stats, or without a valid deadline, are skipped.
 * The result is not compared against the current time, so it may already be
 * in the past when the corresponding cooldown has expired.
 *
 * @returns The earliest deadline, or `null` when no profile has one recorded.
 */
export function getSoonestCooldownExpiry(
  store: AuthProfileStore,
  profileIds: string[],
): number | null {
  let earliest: number | null = null;
  for (const profileId of profileIds) {
    const entry = store.usageStats?.[profileId];
    const deadline = entry ? resolveProfileUnusableUntil(entry) : null;
    if (typeof deadline !== "number" || !Number.isFinite(deadline) || deadline <= 0) {
      continue;
    }
    earliest = earliest === null ? deadline : Math.min(earliest, deadline);
  }
  return earliest;
}
/**
 * Drop cooldown/disabled deadlines that have already passed, for every
 * profile in the store.
 *
 * `cooldownUntil` and `disabledUntil` are evaluated independently: when only
 * one of them has expired, only that one is cleared (`disabledReason` is
 * cleared together with `disabledUntil`). Once a profile has no remaining
 * deadline at all, `errorCount` and `failureCounts` are reset so the next
 * transient failure starts from the shortest backoff instead of escalating
 * from a stale count. `lastFailureAt` is deliberately left alone because the
 * failure-window decay in `computeNextProfileUsageStats` still reads it.
 *
 * Only the in-memory store is mutated; nothing is written to disk here.
 *
 * @param store - Store whose `usageStats` entries are updated in place.
 * @param now - Reference time in ms epoch; defaults to `Date.now()`.
 * @returns `true` if any profile was modified.
 */
export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): boolean {
  const allStats = store.usageStats;
  if (!allStats) {
    return false;
  }

  const referenceTime = now ?? Date.now();
  // A deadline counts as elapsed only when it is a valid timestamp that the
  // reference time has reached.
  const hasElapsed = (until: unknown): boolean =>
    typeof until === "number" &&
    Number.isFinite(until) &&
    until > 0 &&
    referenceTime >= until;

  let anyChanged = false;
  for (const profileId of Object.keys(allStats)) {
    const entry = allStats[profileId];
    if (!entry) {
      continue;
    }

    // Evaluate both deadlines before clearing either of them.
    const cooldownElapsed = hasElapsed(entry.cooldownUntil);
    const disabledElapsed = hasElapsed(entry.disabledUntil);
    if (!cooldownElapsed && !disabledElapsed) {
      continue;
    }

    if (cooldownElapsed) {
      entry.cooldownUntil = undefined;
    }
    if (disabledElapsed) {
      entry.disabledUntil = undefined;
      entry.disabledReason = undefined;
    }

    // Counters restart only when no deadline of either kind remains.
    if (resolveProfileUnusableUntil(entry) === null) {
      entry.errorCount = 0;
      entry.failureCounts = undefined;
    }

    allStats[profileId] = entry;
    anyChanged = true;
  }
  return anyChanged;
}
/**
 * Record a successful use of a profile: stamps `lastUsed`, zeroes the error
 * count and clears every cooldown/disabled/failure field.
 *
 * The update is first attempted through `updateAuthProfileStoreWithLock` so
 * concurrent usage updates are not overwritten. When that call yields no
 * store, the same change is applied to the passed-in store and saved
 * directly. Unknown profile ids are ignored on both paths.
 */
export async function markAuthProfileUsed(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;

  // Applies the "successful use" reset to `target`; reports whether the
  // profile exists there.
  const recordSuccess = (target: AuthProfileStore): boolean => {
    if (!target.profiles[profileId]) {
      return false;
    }
    const stats = target.usageStats ?? {};
    target.usageStats = stats;
    stats[profileId] = {
      ...stats[profileId],
      lastUsed: Date.now(),
      errorCount: 0,
      cooldownUntil: undefined,
      disabledUntil: undefined,
      disabledReason: undefined,
      failureCounts: undefined,
    };
    return true;
  };

  const lockedResult = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => recordSuccess(freshStore),
  });
  if (lockedResult) {
    store.usageStats = lockedResult.usageStats;
    return;
  }

  // Fallback: the locked update produced nothing, so patch the caller's store.
  if (recordSuccess(store)) {
    saveAuthProfileStore(store, agentDir);
  }
}
/**
 * Exponential cooldown for non-billing failures.
 *
 * Starts at one minute and multiplies by five per additional error
 * (1 min, 5 min, 25 min, ...), capped at one hour. Error counts below 1 are
 * treated as 1.
 *
 * @param errorCount - Number of consecutive failures recorded so far.
 * @returns Cooldown duration in milliseconds.
 */
export function calculateAuthProfileCooldownMs(errorCount: number): number {
  const oneMinuteMs = 60 * 1000;
  const ceilingMs = 60 * 60 * 1000; // 1 hour max
  const step = Math.min(Math.max(1, errorCount) - 1, 3);
  return Math.min(ceilingMs, oneMinuteMs * 5 ** step);
}
/**
 * Cooldown settings after defaults and per-provider overrides have been
 * applied, with every duration expressed in milliseconds.
 */
type ResolvedAuthCooldownConfig = {
  /** Base duration of the billing-failure disable backoff. */
  billingBackoffMs: number;
  /** Upper bound for the billing-failure disable backoff. */
  billingMaxMs: number;
  /** Once the last failure is older than this, error counters restart. */
  failureWindowMs: number;
};
/**
 * Resolve the cooldown configuration for one provider.
 *
 * Values are read from `cfg.auth.cooldowns` in hours and converted to
 * milliseconds. Anything that is not a finite positive number falls back to
 * the built-in default (5 h billing backoff, 24 h billing max, 24 h failure
 * window). For the billing backoff, an entry in
 * `billingBackoffHoursByProvider` whose normalized key equals `providerId`
 * takes precedence over the global `billingBackoffHours`.
 *
 * @param params.cfg - Optional application config.
 * @param params.providerId - Provider id, compared against normalized map keys.
 */
function resolveAuthCooldownConfig(params: {
  cfg?: OpenClawConfig;
  providerId: string;
}): ResolvedAuthCooldownConfig {
  const DEFAULT_BILLING_BACKOFF_HOURS = 5;
  const DEFAULT_BILLING_MAX_HOURS = 24;
  const DEFAULT_FAILURE_WINDOW_HOURS = 24;

  const hoursOrFallback = (candidate: unknown, fallback: number): number => {
    if (typeof candidate === "number" && Number.isFinite(candidate) && candidate > 0) {
      return candidate;
    }
    return fallback;
  };
  const hoursToMs = (hours: number): number => hours * 60 * 60 * 1000;

  const cooldowns = params.cfg?.auth?.cooldowns;

  // First map entry whose normalized key matches the requested provider.
  const perProvider = cooldowns?.billingBackoffHoursByProvider;
  const providerMatch = perProvider
    ? Object.entries(perProvider).find(
        ([key]) => normalizeProviderId(key) === params.providerId,
      )
    : undefined;
  const providerOverride = providerMatch?.[1];

  return {
    billingBackoffMs: hoursToMs(
      hoursOrFallback(
        providerOverride ?? cooldowns?.billingBackoffHours,
        DEFAULT_BILLING_BACKOFF_HOURS,
      ),
    ),
    billingMaxMs: hoursToMs(
      hoursOrFallback(cooldowns?.billingMaxHours, DEFAULT_BILLING_MAX_HOURS),
    ),
    failureWindowMs: hoursToMs(
      hoursOrFallback(cooldowns?.failureWindowHours, DEFAULT_FAILURE_WINDOW_HOURS),
    ),
  };
}
/**
 * Exponential disable duration for billing failures.
 *
 * The duration starts at `baseMs` (never below one minute) and doubles per
 * additional billing error, with the exponent capped at 10 and the result
 * capped at `maxMs` (never below the effective base). Error counts below 1
 * are treated as 1.
 *
 * @returns Disable duration in milliseconds.
 */
function calculateAuthProfileBillingDisableMsWithConfig(params: {
  errorCount: number;
  baseMs: number;
  maxMs: number;
}): number {
  const { errorCount, baseMs, maxMs } = params;
  const floorMs = Math.max(60_000, baseMs);
  const ceilingMs = Math.max(floorMs, maxMs);
  const doublings = Math.min(Math.max(1, errorCount) - 1, 10);
  return Math.min(ceilingMs, floorMs * 2 ** doublings);
}
/**
 * Look up a profile's usage stats in the store and resolve the timestamp
 * until which it is unusable.
 *
 * @returns The deadline, or `null` when the profile has no usage stats or no
 * valid deadline.
 */
export function resolveProfileUnusableUntilForDisplay(
  store: AuthProfileStore,
  profileId: string,
): number | null {
  const entry = store.usageStats?.[profileId];
  return entry ? resolveProfileUnusableUntil(entry) : null;
}
/**
 * Compute the usage stats that result from recording one more failure.
 *
 * Error counters restart from zero when the previous failure is older than
 * `cfgResolved.failureWindowMs`; otherwise they accumulate. Billing failures
 * set `disabledUntil` (and `disabledReason: "billing"`); every other reason
 * sets `cooldownUntil`.
 *
 * A deadline that is still in the future is kept as-is. Comparing the
 * recomputed deadline against the stored one is not enough: once the backoff
 * saturates, `now + backoff` is later than the stored deadline on every retry
 * (because `now` has advanced), so a caller retrying more often than the
 * backoff duration would push the window forward forever and the profile
 * would never recover. A new deadline is therefore written only when no
 * window is currently active. The counters still increase during an active
 * window, so the first failure after it expires uses the escalated backoff.
 *
 * @param params.existing - Stats currently stored for the profile.
 * @param params.now - Current time (ms epoch).
 * @param params.reason - Why the request failed.
 * @param params.cfgResolved - Resolved cooldown configuration for the provider.
 * @returns A new stats object; `params.existing` is not mutated.
 */
function computeNextProfileUsageStats(params: {
  existing: ProfileUsageStats;
  now: number;
  reason: AuthProfileFailureReason;
  cfgResolved: ResolvedAuthCooldownConfig;
}): ProfileUsageStats {
  const windowMs = params.cfgResolved.failureWindowMs;
  const windowExpired =
    typeof params.existing.lastFailureAt === "number" &&
    params.existing.lastFailureAt > 0 &&
    params.now - params.existing.lastFailureAt > windowMs;

  const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
  const nextErrorCount = baseErrorCount + 1;
  const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
  failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;

  const updatedStats: ProfileUsageStats = {
    ...params.existing,
    errorCount: nextErrorCount,
    failureCounts,
    lastFailureAt: params.now,
  };

  // Keep a deadline that is a valid timestamp still in the future; otherwise
  // use the freshly computed one. Because the recomputed value is always
  // later than `now`, this can never shorten an existing window.
  const keepActiveWindowOrRecompute = (
    existingUntil: number | undefined,
    recomputedUntil: number,
  ): number =>
    typeof existingUntil === "number" &&
    Number.isFinite(existingUntil) &&
    existingUntil > params.now
      ? existingUntil
      : recomputedUntil;

  if (params.reason === "billing") {
    // Billing backoff escalates on the number of billing failures only.
    const billingCount = failureCounts.billing ?? 1;
    const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
      errorCount: billingCount,
      baseMs: params.cfgResolved.billingBackoffMs,
      maxMs: params.cfgResolved.billingMaxMs,
    });
    updatedStats.disabledUntil = keepActiveWindowOrRecompute(
      params.existing.disabledUntil,
      params.now + backoffMs,
    );
    updatedStats.disabledReason = "billing";
  } else {
    const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
    updatedStats.cooldownUntil = keepActiveWindowOrRecompute(
      params.existing.cooldownUntil,
      params.now + backoffMs,
    );
  }

  return updatedStats;
}
/**
 * Record a failure for a profile and derive its next cooldown/disabled state.
 *
 * Billing failures disable the profile (longer backoff); all other reasons
 * put it into the regular cooldown window. The update is first attempted via
 * `updateAuthProfileStoreWithLock`; when that call yields no store, the same
 * computation is applied to the passed-in store and saved directly. Unknown
 * profile ids are ignored on both paths.
 */
export async function markAuthProfileFailure(params: {
  store: AuthProfileStore;
  profileId: string;
  reason: AuthProfileFailureReason;
  cfg?: OpenClawConfig;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, reason, agentDir, cfg } = params;

  // Writes the post-failure stats for `profileId` into `target`, resolving
  // the cooldown config for the given (not yet normalized) provider name.
  const recordFailure = (target: AuthProfileStore, provider: string): void => {
    target.usageStats = target.usageStats ?? {};
    const existing = target.usageStats[profileId] ?? {};
    const now = Date.now();
    const cfgResolved = resolveAuthCooldownConfig({
      cfg,
      providerId: normalizeProviderId(provider),
    });
    target.usageStats[profileId] = computeNextProfileUsageStats({
      existing,
      now,
      reason,
      cfgResolved,
    });
  };

  const lockedResult = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      const profile = freshStore.profiles[profileId];
      if (!profile) {
        return false;
      }
      recordFailure(freshStore, profile.provider);
      return true;
    },
  });
  if (lockedResult) {
    store.usageStats = lockedResult.usageStats;
    return;
  }

  // Fallback: the locked update produced nothing, so patch the caller's store.
  const fallbackProfile = store.profiles[profileId];
  if (!fallbackProfile) {
    return;
  }
  recordFailure(store, fallbackProfile.provider ?? "");
  saveAuthProfileStore(store, agentDir);
}
/**
 * Put a profile into cooldown after a failure or rate limit.
 *
 * Delegates to `markAuthProfileFailure` with reason `"unknown"` and without a
 * config, so the regular exponential cooldown applies
 * (1 min, 5 min, 25 min, capped at 1 hour).
 */
export async function markAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;
  await markAuthProfileFailure({ store, profileId, reason: "unknown", agentDir });
}
/**
 * Reset a profile's failure state (e.g. for a manual reset): zeroes the error
 * count and clears the cooldown, disabled and failure-count fields.
 *
 * The update is first attempted via `updateAuthProfileStoreWithLock` so
 * concurrent usage updates are not overwritten; when that call yields no
 * store, the passed-in store is patched and saved directly. Profiles without
 * usage stats are left untouched on both paths.
 */
export async function clearAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;

  // Clears the failure state in `target`; reports whether there was an entry.
  const resetFailureState = (target: AuthProfileStore): boolean => {
    const stats = target.usageStats;
    const current = stats?.[profileId];
    if (!stats || !current) {
      return false;
    }
    stats[profileId] = {
      ...current,
      errorCount: 0,
      cooldownUntil: undefined,
      disabledUntil: undefined,
      disabledReason: undefined,
      failureCounts: undefined,
    };
    return true;
  };

  const lockedResult = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => resetFailureState(freshStore),
  });
  if (lockedResult) {
    store.usageStats = lockedResult.usageStats;
    return;
  }

  // Fallback: the locked update produced nothing, so patch the caller's store.
  if (resetFailureState(store)) {
    saveAuthProfileStore(store, agentDir);
  }
}