fix(agents): handle overloaded failover separately (#38301)

* fix(agents): skip auth-profile failure on overload

* fix(agents): note overload auth-profile fallback fix

* fix(agents): classify overloaded failures separately

* fix(agents): back off before overload failover

* fix(agents): tighten overload probe and backoff state

* fix(agents): persist overloaded cooldown across runs

* fix(agents): tighten overloaded status handling

* test(agents): add overload regression coverage

* fix(agents): restore runner imports after rebase

* test(agents): add overload fallback integration coverage

* fix(agents): harden overloaded failover abort handling

* test(agents): tighten overload classifier coverage

* test(agents): cover all-overloaded fallback exhaustion

* fix(cron): retry overloaded fallback summaries

* fix(cron): treat HTTP 529 as overloaded retry
This commit is contained in:
Altay
2026-03-07 01:42:11 +03:00
committed by GitHub
parent 110ca23bab
commit 6e962d8b9e
36 changed files with 1036 additions and 84 deletions

View File

@@ -293,13 +293,17 @@ export function classifyFailoverReasonFromHttpStatus(
 if (status === 408) {
 return "timeout";
 }
-// Keep the status-only path conservative and behavior-preserving.
-// Message-path HTTP heuristics are broader and should not leak in here.
-if (status === 502 || status === 503 || status === 504) {
+if (status === 503) {
+if (message && isOverloadedErrorMessage(message)) {
+return "overloaded";
+}
+return "timeout";
+}
+if (status === 502 || status === 504) {
 return "timeout";
 }
 if (status === 529) {
-return "rate_limit";
+return "overloaded";
 }
 if (status === 400) {
 // Some providers return quota/balance errors under HTTP 400, so do not
@@ -854,13 +858,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
 if (isModelNotFoundErrorMessage(raw)) {
 return "model_not_found";
 }
-if (isTransientHttpError(raw)) {
-// Treat transient 5xx provider failures as retryable transport issues.
-return "timeout";
-}
-if (isJsonApiInternalServerError(raw)) {
-return "timeout";
-}
 if (isPeriodicUsageLimitErrorMessage(raw)) {
 return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
 }
@@ -868,7 +865,19 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
 return "rate_limit";
 }
 if (isOverloadedErrorMessage(raw)) {
-return "rate_limit";
+return "overloaded";
 }
+if (isTransientHttpError(raw)) {
+// 529 is always overloaded, even without explicit overload keywords in the body.
+const status = extractLeadingHttpStatus(raw.trim());
+if (status?.code === 529) {
+return "overloaded";
+}
+// Treat remaining transient 5xx provider failures as retryable transport issues.
+return "timeout";
+}
+if (isJsonApiInternalServerError(raw)) {
+return "timeout";
+}
 if (isCloudCodeAssistFormatError(raw)) {
 return "format";

View File

@@ -5,6 +5,7 @@ export type FailoverReason =
 | "auth_permanent"
 | "format"
 | "rate_limit"
+| "overloaded"
 | "billing"
 | "timeout"
 | "model_not_found"