fix(agents): handle overloaded failover separately (#38301)

* fix(agents): skip auth-profile failure on overload

* fix(agents): note overload auth-profile fallback fix

* fix(agents): classify overloaded failures separately

* fix(agents): back off before overload failover

* fix(agents): tighten overload probe and backoff state

* fix(agents): persist overloaded cooldown across runs

* fix(agents): tighten overloaded status handling

* test(agents): add overload regression coverage

* fix(agents): restore runner imports after rebase

* test(agents): add overload fallback integration coverage

* fix(agents): harden overloaded failover abort handling

* test(agents): tighten overload classifier coverage

* test(agents): cover all-overloaded fallback exhaustion

* fix(cron): retry overloaded fallback summaries

* fix(cron): treat HTTP 529 as overloaded retry
This commit is contained in:
Altay
2026-03-07 01:42:11 +03:00
committed by GitHub
parent 110ca23bab
commit 6e962d8b9e
36 changed files with 1036 additions and 84 deletions

View File

@@ -34,7 +34,7 @@ type ModelCandidate = {
};
export type ModelFallbackRunOptions = {
-allowRateLimitCooldownProbe?: boolean;
+allowTransientCooldownProbe?: boolean;
};
type ModelFallbackRunFn<T> = (
@@ -428,11 +428,11 @@ function resolveCooldownDecision(params: {
}
// For primary: try when requested model or when probe allows.
-// For same-provider fallbacks: only relax cooldown on rate_limit, which
-// is commonly model-scoped and can recover on a sibling model.
+// For same-provider fallbacks: only relax cooldown on transient provider
+// limits, which are often model-scoped and can recover on a sibling model.
const shouldAttemptDespiteCooldown =
(params.isPrimary && (!params.requestedModel || shouldProbe)) ||
-(!params.isPrimary && inferredReason === "rate_limit");
+(!params.isPrimary && (inferredReason === "rate_limit" || inferredReason === "overloaded"));
if (!shouldAttemptDespiteCooldown) {
return {
type: "skip",
@@ -514,8 +514,8 @@ export async function runWithModelFallback<T>(params: {
if (decision.markProbe) {
lastProbeAttempt.set(probeThrottleKey, now);
}
-if (decision.reason === "rate_limit") {
-runOptions = { allowRateLimitCooldownProbe: true };
+if (decision.reason === "rate_limit" || decision.reason === "overloaded") {
+runOptions = { allowTransientCooldownProbe: true };
}
}
}