Files
openclaw/src/agents/failover-error.ts
Protocol Zero 2af3415fac fix: treat HTTP 503 as failover-eligible for LLM provider errors (#21086)
* fix: treat HTTP 503 as failover-eligible for LLM provider errors

When LLM SDKs wrap 503 responses, the leading "503" prefix is lost
(e.g. Google Gemini returns "high demand" / "UNAVAILABLE" without a
numeric prefix). The existing isTransientHttpError only matches
messages starting with "503 ...", so these wrapped errors silently
skip failover — no profile rotation, no model fallback.

This patch closes that gap:

- resolveFailoverReasonFromError: map HTTP status 503 → rate_limit
  (covers structured error objects with a status field)
- ERROR_PATTERNS.overloaded: add /\b503\b/, "service unavailable",
  "high demand" (covers message-only classification when the leading
  status prefix is absent)

Existing isTransientHttpError behavior is unchanged; these additions
are complementary and only fire for errors that previously fell
through unclassified.

* fix: address review feedback — drop /\b503\b/ pattern, add test coverage

- Remove `/\b503\b/` from ERROR_PATTERNS.overloaded to resolve the
  semantic inconsistency noted by reviewers: `isTransientHttpError`
  already handles messages prefixed with "503" (→ "timeout"), so a
  redundant overloaded pattern would classify the same class of errors
  differently depending on message formatting.

- Keep "service unavailable" and "high demand" patterns — these are the
  real gap-fillers for SDK-rewritten messages that lack a numeric prefix.

- Add test case for JSON-wrapped 503 error body containing "overloaded"
  to strengthen coverage.

* fix: unify 503 classification — status 503 → timeout (consistent with isTransientHttpError)

resolveFailoverReasonFromError previously mapped status 503 → "rate_limit",
while the string-based isTransientHttpError mapped "503 ..." → "timeout".

Align both paths: structured {status: 503} now also returns "timeout",
matching the existing transient-error convention. Both reasons are
failover-eligible, so runtime behavior is unchanged.

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
2026-02-19 12:45:09 -08:00

239 lines
5.7 KiB
TypeScript

import { classifyFailoverReason, type FailoverReason } from "./pi-embedded-helpers.js";
// Case-insensitive phrases that mark an error message as timeout-like.
// Tested against the error message in hasTimeoutHint().
const TIMEOUT_HINT_RE =
/timeout|timed out|deadline exceeded|context deadline exceeded|stop reason:\s*abort|reason:\s*abort|unhandled stop reason:\s*abort/i;
// Case-insensitive phrases that isTimeoutError() accepts as a timeout when
// they appear in the message of an error named "AbortError".
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
/**
 * Error that carries a failover classification (`reason`) together with
 * optional context about where it happened (provider, model, profile) and
 * optional transport details (HTTP-style status, error code).
 *
 * The original error, if any, is forwarded to `Error` as `cause`.
 */
export class FailoverError extends Error {
  readonly reason: FailoverReason;
  readonly provider?: string;
  readonly model?: string;
  readonly profileId?: string;
  readonly status?: number;
  readonly code?: string;

  /**
   * @param message - Human-readable error message.
   * @param params - Classification and context; only `reason` is required.
   */
  constructor(
    message: string,
    params: {
      reason: FailoverReason;
      provider?: string;
      model?: string;
      profileId?: string;
      status?: number;
      code?: string;
      cause?: unknown;
    },
  ) {
    const { cause, reason, provider, model, profileId, status, code } = params;
    super(message, { cause });
    // Override the default "Error" name so logs identify the subclass.
    this.name = "FailoverError";
    this.reason = reason;
    this.provider = provider;
    this.model = model;
    this.profileId = profileId;
    this.status = status;
    this.code = code;
  }
}
/**
 * Type guard that reports whether `err` is a `FailoverError` instance.
 *
 * @param err - Any thrown or captured value.
 * @returns `true` when `err` passes an `instanceof FailoverError` check.
 */
export function isFailoverError(err: unknown): err is FailoverError {
  const matches = err instanceof FailoverError;
  return matches;
}
// HTTP status associated with each failover reason that has one.
const FAILOVER_STATUS_BY_REASON: ReadonlyMap<FailoverReason, number> = new Map<
  FailoverReason,
  number
>([
  ["billing", 402],
  ["rate_limit", 429],
  ["auth", 401],
  ["timeout", 408],
  ["format", 400],
]);

/**
 * Looks up the HTTP status code associated with a failover reason.
 *
 * @param reason - Failover classification to translate.
 * @returns The matching status code, or `undefined` for any reason that has
 *   no entry in the table.
 */
export function resolveFailoverStatus(reason: FailoverReason): number | undefined {
  return FAILOVER_STATUS_BY_REASON.get(reason);
}
/**
 * Extracts a numeric HTTP-style status from an error-like object.
 *
 * `status` is tried first, then `statusCode`. A candidate is usable when it
 * is a finite number or a string made only of decimal digits. An unusable
 * `status` (for example a textual status such as "UNAVAILABLE", or `NaN`)
 * no longer hides a usable `statusCode`.
 *
 * @param err - Any thrown or captured value.
 * @returns The status as a number, or `undefined` when `err` is not an
 *   object or neither property holds a usable value.
 */
function getStatusCode(err: unknown): number | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
  const { status, statusCode } = err as { status?: unknown; statusCode?: unknown };
  const digitsOnly = /^\d+$/;
  for (const candidate of [status, statusCode]) {
    // Reject NaN/Infinity so a broken numeric field is not reported as a status.
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      return candidate;
    }
    if (typeof candidate === "string" && digitsOnly.test(candidate)) {
      return Number(candidate);
    }
  }
  return undefined;
}
/**
 * Reads the `name` property of an error-like object as a string.
 *
 * @param err - Any thrown or captured value.
 * @returns `String(err.name)` when `err` is a non-null object that has a
 *   `name` property (own or inherited); otherwise the empty string.
 */
function getErrorName(err: unknown): string {
  const hasName = typeof err === "object" && err !== null && "name" in err;
  return hasName ? String((err as { name: unknown }).name) : "";
}
/**
 * Reads a string `code` property from an error-like object.
 *
 * @param err - Any thrown or captured value.
 * @returns The trimmed code, or `undefined` when `err` is not an object,
 *   `code` is not a string, or the code is empty after trimming.
 */
function getErrorCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const { code } = err as { code?: unknown };
  const normalized = typeof code === "string" ? code.trim() : "";
  // An empty string means "no code", which callers see as undefined.
  return normalized || undefined;
}
/**
 * Derives a message string from an arbitrary thrown value.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged; numbers, booleans and bigints are
 *   stringified; symbols yield their description (or "").
 * - Other non-null objects yield `message` when it is a string.
 *
 * @param err - Any thrown or captured value.
 * @returns The derived message, or the empty string when none is available.
 */
function getErrorMessage(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  switch (typeof err) {
    case "string":
      return err;
    case "number":
    case "boolean":
    case "bigint":
      return String(err);
    case "symbol":
      return err.description ?? "";
    case "object": {
      // typeof null is "object", so exclude it explicitly.
      if (err === null) {
        return "";
      }
      const { message } = err as { message?: unknown };
      return typeof message === "string" ? message : "";
    }
    default:
      return "";
  }
}
/**
 * Reports whether a value looks like a timeout, based on its name or message.
 *
 * @param err - Any thrown or captured value.
 * @returns `true` when `err` is truthy and either its name is exactly
 *   "TimeoutError" or its message matches `TIMEOUT_HINT_RE`.
 */
function hasTimeoutHint(err: unknown): boolean {
  return (
    Boolean(err) &&
    (getErrorName(err) === "TimeoutError" || TIMEOUT_HINT_RE.test(getErrorMessage(err)))
  );
}
/**
 * Reports whether an error should be treated as a timeout.
 *
 * A value qualifies when it carries a timeout hint itself, or when it is an
 * object named "AbortError" whose message matches `ABORT_TIMEOUT_RE` or
 * whose `cause` / `reason` carries a timeout hint.
 *
 * @param err - Any thrown or captured value.
 * @returns `true` for timeout-like errors, otherwise `false`.
 */
export function isTimeoutError(err: unknown): boolean {
  if (hasTimeoutHint(err)) {
    return true;
  }
  // Only AbortError objects are inspected further.
  if (typeof err !== "object" || err === null || getErrorName(err) !== "AbortError") {
    return false;
  }
  if (ABORT_TIMEOUT_RE.test(getErrorMessage(err))) {
    return true;
  }
  const { cause, reason } = err as { cause?: unknown; reason?: unknown };
  return [cause, reason].some((linked) => hasTimeoutHint(linked));
}
/**
 * Classifies an arbitrary error into a failover reason.
 *
 * Checks run in this order and the first match wins:
 * 1. An existing `FailoverError` keeps its own `reason`.
 * 2. HTTP status: 402 is billing, 429 is rate_limit, 401/403 is auth,
 *    408/503 is timeout, 400 is format.
 * 3. Error code (upper-cased): ETIMEDOUT, ESOCKETTIMEDOUT, ECONNRESET and
 *    ECONNABORTED are timeout.
 * 4. `isTimeoutError` heuristics are timeout.
 * 5. Otherwise the message is passed to `classifyFailoverReason`.
 *
 * @param err - Any thrown or captured value.
 * @returns The resolved reason; `null` when nothing matched and there is no
 *   message to classify; otherwise whatever `classifyFailoverReason` returns.
 */
export function resolveFailoverReasonFromError(err: unknown): FailoverReason | null {
  if (isFailoverError(err)) {
    return err.reason;
  }

  switch (getStatusCode(err)) {
    case 402:
      return "billing";
    case 429:
      return "rate_limit";
    case 401:
    case 403:
      return "auth";
    case 408:
    case 503:
      return "timeout";
    case 400:
      return "format";
    default:
      break;
  }

  const transientCodes = new Set(["ETIMEDOUT", "ESOCKETTIMEDOUT", "ECONNRESET", "ECONNABORTED"]);
  const normalizedCode = getErrorCode(err)?.toUpperCase() ?? "";
  if (transientCodes.has(normalizedCode) || isTimeoutError(err)) {
    return "timeout";
  }

  const text = getErrorMessage(err);
  return text ? classifyFailoverReason(text) : null;
}
/**
 * Produces a plain summary of an error for reporting.
 *
 * A `FailoverError` is summarized from its own fields. Any other value gets
 * a message (falling back to `String(err)` when no message is found), a
 * reason from `resolveFailoverReasonFromError` (with `null` mapped to
 * `undefined`), and whatever status and code can be read from it.
 *
 * @param err - Any thrown or captured value.
 * @returns An object with `message` and optional `reason`, `status`, `code`.
 */
export function describeFailoverError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (!isFailoverError(err)) {
    const text = getErrorMessage(err) || String(err);
    const resolved = resolveFailoverReasonFromError(err);
    return {
      message: text,
      reason: resolved ?? undefined,
      status: getStatusCode(err),
      code: getErrorCode(err),
    };
  }
  const { message, reason, status, code } = err;
  return { message, reason, status, code };
}
/**
 * Converts an arbitrary error into a `FailoverError` when it can be classified.
 *
 * An existing `FailoverError` is returned as-is. Otherwise the error is
 * classified with `resolveFailoverReasonFromError`. On success, a new
 * `FailoverError` is built from the error's message, status and code plus the
 * supplied context. The status falls back to `resolveFailoverStatus(reason)`
 * when the error has none.
 *
 * @param err - Any thrown or captured value.
 * @param context - Optional provider/model/profile identifiers to attach.
 * @returns The `FailoverError`, or `null` when no reason could be resolved.
 */
export function coerceToFailoverError(
  err: unknown,
  context?: {
    provider?: string;
    model?: string;
    profileId?: string;
  },
): FailoverError | null {
  if (isFailoverError(err)) {
    return err;
  }
  const reason = resolveFailoverReasonFromError(err);
  if (!reason) {
    return null;
  }
  return new FailoverError(getErrorMessage(err) || String(err), {
    reason,
    provider: context?.provider,
    model: context?.model,
    profileId: context?.profileId,
    status: getStatusCode(err) ?? resolveFailoverStatus(reason),
    code: getErrorCode(err),
    // Only real Error instances are kept as the cause.
    cause: err instanceof Error ? err : undefined,
  });
}