mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 03:28:29 +00:00
fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) (#21017)
* fix: treat HTTP 502/503/504 as failover-eligible (timeout reason)

  When a model API returns 502 Bad Gateway, 503 Service Unavailable, or 504 Gateway Timeout, the error object carries the status code directly. resolveFailoverReasonFromError() only checked 402/429/401/403/408/400, so 5xx server errors fell through to message-based classification, which requires the status code to appear at the start of the error message.

  Many API SDKs (Google, Anthropic) set err.status = 503 without prefixing the message with '503', so the message classifier never matched and failover never triggered — the run retried the same broken model.

  Add 502/503/504 to the status-code branch, returning 'timeout' (matching the existing behavior of isTransientHttpError in the message classifier).

  Fixes #20999

* Changelog: add failover 502/503/504 note with credits
* Failover: classify HTTP 504 as transient in message parser
* Changelog: credit taw0002 and vincentkoc for failover fix

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -13,7 +13,10 @@ describe("failover-error", () => {
|
||||
expect(resolveFailoverReasonFromError({ status: 403 })).toBe("auth");
|
||||
expect(resolveFailoverReasonFromError({ status: 408 })).toBe("timeout");
|
||||
expect(resolveFailoverReasonFromError({ status: 400 })).toBe("format");
|
||||
// Transient server errors (502/503/504) should trigger failover as timeout.
|
||||
expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");
|
||||
expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");
|
||||
expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");
|
||||
});
|
||||
|
||||
it("infers format errors from error messages", () => {
|
||||
|
||||
@@ -163,7 +163,7 @@ export function resolveFailoverReasonFromError(err: unknown): FailoverReason | n
|
||||
if (status === 408) {
|
||||
return "timeout";
|
||||
}
|
||||
- if (status === 503) {
|
||||
+ if (status === 502 || status === 503 || status === 504) {
|
||||
return "timeout";
|
||||
}
|
||||
if (status === 400) {
|
||||
|
||||
@@ -270,12 +270,12 @@ describe("isTransientHttpError", () => {
|
||||
expect(isTransientHttpError("500 Internal Server Error")).toBe(true);
|
||||
expect(isTransientHttpError("502 Bad Gateway")).toBe(true);
|
||||
expect(isTransientHttpError("503 Service Unavailable")).toBe(true);
|
||||
+ expect(isTransientHttpError("504 Gateway Timeout")).toBe(true);
|
||||
expect(isTransientHttpError("521 <!DOCTYPE html><html></html>")).toBe(true);
|
||||
expect(isTransientHttpError("529 Overloaded")).toBe(true);
|
||||
});
|
||||
|
||||
it("returns false for non-retryable or non-http text", () => {
|
||||
- expect(isTransientHttpError("504 Gateway Timeout")).toBe(false);
|
||||
expect(isTransientHttpError("429 Too Many Requests")).toBe(false);
|
||||
expect(isTransientHttpError("network timeout")).toBe(false);
|
||||
});
|
||||
|
||||
@@ -120,7 +120,7 @@ const HTTP_STATUS_PREFIX_RE = /^(?:http\s*)?(\d{3})\s+(.+)$/i;
|
||||
const HTTP_STATUS_CODE_PREFIX_RE = /^(?:http\s*)?(\d{3})(?:\s+([\s\S]+))?$/i;
|
||||
const HTML_ERROR_PREFIX_RE = /^\s*(?:<!doctype\s+html\b|<html\b)/i;
|
||||
const CLOUDFLARE_HTML_ERROR_CODES = new Set([521, 522, 523, 524, 525, 526, 530]);
|
||||
- const TRANSIENT_HTTP_ERROR_CODES = new Set([500, 502, 503, 521, 522, 523, 524, 529]);
|
||||
+ const TRANSIENT_HTTP_ERROR_CODES = new Set([500, 502, 503, 504, 521, 522, 523, 524, 529]);
|
||||
const HTTP_ERROR_HINTS = [
|
||||
"error",
|
||||
"bad request",
|
||||
|
||||
Reference in New Issue
Block a user