mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-11 14:51:42 +00:00
fix(agents): handle overloaded failover separately (#38301)
* fix(agents): skip auth-profile failure on overload
* fix(agents): note overload auth-profile fallback fix
* fix(agents): classify overloaded failures separately
* fix(agents): back off before overload failover
* fix(agents): tighten overload probe and backoff state
* fix(agents): persist overloaded cooldown across runs
* fix(agents): tighten overloaded status handling
* test(agents): add overload regression coverage
* fix(agents): restore runner imports after rebase
* test(agents): add overload fallback integration coverage
* fix(agents): harden overloaded failover abort handling
* test(agents): tighten overload classifier coverage
* test(agents): cover all-overloaded fallback exhaustion
* fix(cron): retry overloaded fallback summaries
* fix(cron): treat HTTP 529 as overloaded retry
This commit is contained in:
@@ -293,13 +293,17 @@ export function classifyFailoverReasonFromHttpStatus(
|
||||
if (status === 408) {
|
||||
return "timeout";
|
||||
}
|
||||
// Keep the status-only path conservative and behavior-preserving.
|
||||
// Message-path HTTP heuristics are broader and should not leak in here.
|
||||
if (status === 502 || status === 503 || status === 504) {
|
||||
if (status === 503) {
|
||||
if (message && isOverloadedErrorMessage(message)) {
|
||||
return "overloaded";
|
||||
}
|
||||
return "timeout";
|
||||
}
|
||||
if (status === 502 || status === 504) {
|
||||
return "timeout";
|
||||
}
|
||||
if (status === 529) {
|
||||
return "rate_limit";
|
||||
return "overloaded";
|
||||
}
|
||||
if (status === 400) {
|
||||
// Some providers return quota/balance errors under HTTP 400, so do not
|
||||
@@ -854,13 +858,6 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
|
||||
if (isModelNotFoundErrorMessage(raw)) {
|
||||
return "model_not_found";
|
||||
}
|
||||
if (isTransientHttpError(raw)) {
|
||||
// Treat transient 5xx provider failures as retryable transport issues.
|
||||
return "timeout";
|
||||
}
|
||||
if (isJsonApiInternalServerError(raw)) {
|
||||
return "timeout";
|
||||
}
|
||||
if (isPeriodicUsageLimitErrorMessage(raw)) {
|
||||
return isBillingErrorMessage(raw) ? "billing" : "rate_limit";
|
||||
}
|
||||
@@ -868,7 +865,19 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
|
||||
return "rate_limit";
|
||||
}
|
||||
if (isOverloadedErrorMessage(raw)) {
|
||||
return "rate_limit";
|
||||
return "overloaded";
|
||||
}
|
||||
if (isTransientHttpError(raw)) {
|
||||
// 529 is always overloaded, even without explicit overload keywords in the body.
|
||||
const status = extractLeadingHttpStatus(raw.trim());
|
||||
if (status?.code === 529) {
|
||||
return "overloaded";
|
||||
}
|
||||
// Treat remaining transient 5xx provider failures as retryable transport issues.
|
||||
return "timeout";
|
||||
}
|
||||
if (isJsonApiInternalServerError(raw)) {
|
||||
return "timeout";
|
||||
}
|
||||
if (isCloudCodeAssistFormatError(raw)) {
|
||||
return "format";
|
||||
|
||||
@@ -5,6 +5,7 @@ export type FailoverReason =
|
||||
| "auth_permanent"
|
||||
| "format"
|
||||
| "rate_limit"
|
||||
| "overloaded"
|
||||
| "billing"
|
||||
| "timeout"
|
||||
| "model_not_found"
|
||||
|
||||
Reference in New Issue
Block a user