mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 22:04:31 +00:00
fix(infra): prevent gateway crashes on transient network errors
This commit is contained in:
@@ -68,12 +68,14 @@ Status: unreleased.
|
|||||||
### Breaking
|
### Breaking
|
||||||
- **BREAKING:** Gateway auth mode "none" is removed; gateway now requires token/password (Tailscale Serve identity still allowed).
|
- **BREAKING:** Gateway auth mode "none" is removed; gateway now requires token/password (Tailscale Serve identity still allowed).
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
### Fixes
|
### Fixes
|
||||||
- Agents: prevent retries on oversized image errors and surface size limits. (#2871) Thanks @Suksham-sharma.
|
- Agents: prevent retries on oversized image errors and surface size limits. (#2871) Thanks @Suksham-sharma.
|
||||||
- Agents: inherit provider baseUrl/api for inline models. (#2740) Thanks @lploc94.
|
- Agents: inherit provider baseUrl/api for inline models. (#2740) Thanks @lploc94.
|
||||||
- Memory Search: keep auto provider model defaults and only include remote when configured. (#2576) Thanks @papago2355.
|
- Memory Search: keep auto provider model defaults and only include remote when configured. (#2576) Thanks @papago2355.
|
||||||
- macOS: auto-scroll to bottom when sending a new message while scrolled up. (#2471) Thanks @kennyklee.
|
- macOS: auto-scroll to bottom when sending a new message while scrolled up. (#2471) Thanks @kennyklee.
|
||||||
- Web UI: auto-expand the chat compose textarea while typing (with sensible max height). (#2950) Thanks @shivamraut101.
|
- Web UI: auto-expand the chat compose textarea while typing (with sensible max height). (#2950) Thanks @shivamraut101.
|
||||||
|
- Gateway: prevent crashes on transient network errors (fetch failures, timeouts, DNS). Added fatal error detection to only exit on truly critical errors. Fixes #2895, #2879, #2873. (#2980) Thanks @elliotsecops.
|
||||||
- Gateway: suppress AbortError and transient network errors in unhandled rejections. (#2451) Thanks @Glucksberg.
|
- Gateway: suppress AbortError and transient network errors in unhandled rejections. (#2451) Thanks @Glucksberg.
|
||||||
- TTS: keep /tts status replies on text-only commands and avoid duplicate block-stream audio. (#2451) Thanks @Glucksberg.
|
- TTS: keep /tts status replies on text-only commands and avoid duplicate block-stream audio. (#2451) Thanks @Glucksberg.
|
||||||
- Security: pin npm overrides to keep tar@7.5.4 for install toolchains.
|
- Security: pin npm overrides to keep tar@7.5.4 for install toolchains.
|
||||||
|
|||||||
159
src/infra/unhandled-rejections.fatal-detection.test.ts
Normal file
159
src/infra/unhandled-rejections.fatal-detection.test.ts
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
import { describe, it, expect, vi, beforeAll, afterAll, beforeEach, afterEach } from "vitest";
|
||||||
|
import process from "node:process";
|
||||||
|
|
||||||
|
import { installUnhandledRejectionHandler } from "./unhandled-rejections.js";
|
||||||
|
|
||||||
|
/**
 * Exercises the process-level "unhandledRejection" listener registered by
 * installUnhandledRejectionHandler().
 *
 * Each test emits a synthetic rejection reason and checks two things:
 *  - whether the handler asked the process to exit (and with which code), and
 *  - which console channel received the log line.
 *
 * process.exit is replaced with a recording stub so an exit request is observed
 * instead of terminating the test runner. console.error / console.warn are
 * silenced and inspected through spies.
 */
describe("installUnhandledRejectionHandler - fatal detection", () => {
  let exitCalls: Array<string | number | null> = [];
  let consoleErrorSpy: ReturnType<typeof vi.spyOn>;
  let consoleWarnSpy: ReturnType<typeof vi.spyOn>;
  let originalExit: typeof process.exit;

  /** Delivers `reason` to the registered "unhandledRejection" listeners. */
  const emitRejection = (reason: unknown): void => {
    process.emit("unhandledRejection", reason, Promise.resolve());
  };

  beforeAll(() => {
    originalExit = process.exit;
    installUnhandledRejectionHandler();
  });

  beforeEach(() => {
    exitCalls = [];

    // Record the requested exit code instead of exiting. The cast keeps the stub
    // assignable to process.exit, whose declared return type is `never`.
    vi.spyOn(process, "exit").mockImplementation((code?: string | number | null) => {
      if (code !== undefined && code !== null) {
        exitCalls.push(code);
      }
      return undefined as never;
    });

    consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
    consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
  });

  afterEach(() => {
    vi.clearAllMocks();
    consoleErrorSpy.mockRestore();
    consoleWarnSpy.mockRestore();
  });

  afterAll(() => {
    process.exit = originalExit;
  });

  describe("fatal errors", () => {
    it("exits on ERR_OUT_OF_MEMORY", () => {
      const oomErr = Object.assign(new Error("Out of memory"), {
        code: "ERR_OUT_OF_MEMORY",
      });

      emitRejection(oomErr);

      expect(exitCalls).toEqual([1]);
      // The prefix must match the string the handler actually logs.
      expect(consoleErrorSpy).toHaveBeenCalledWith(
        "[moltbot] FATAL unhandled rejection:",
        expect.stringContaining("Out of memory"),
      );
    });

    it("exits on ERR_SCRIPT_EXECUTION_TIMEOUT", () => {
      const timeoutErr = Object.assign(new Error("Script execution timeout"), {
        code: "ERR_SCRIPT_EXECUTION_TIMEOUT",
      });

      emitRejection(timeoutErr);

      expect(exitCalls).toEqual([1]);
    });

    it("exits on ERR_WORKER_OUT_OF_MEMORY", () => {
      const workerOomErr = Object.assign(new Error("Worker out of memory"), {
        code: "ERR_WORKER_OUT_OF_MEMORY",
      });

      emitRejection(workerOomErr);

      expect(exitCalls).toEqual([1]);
    });
  });

  describe("configuration errors", () => {
    it("exits on INVALID_CONFIG", () => {
      const configErr = Object.assign(new Error("Invalid config"), {
        code: "INVALID_CONFIG",
      });

      emitRejection(configErr);

      expect(exitCalls).toEqual([1]);
      expect(consoleErrorSpy).toHaveBeenCalledWith(
        "[moltbot] CONFIGURATION ERROR - requires fix:",
        expect.stringContaining("Invalid config"),
      );
    });

    it("exits on MISSING_API_KEY", () => {
      const missingKeyErr = Object.assign(new Error("Missing API key"), {
        code: "MISSING_API_KEY",
      });

      emitRejection(missingKeyErr);

      expect(exitCalls).toEqual([1]);
    });
  });

  describe("non-fatal errors", () => {
    it("does NOT exit on undici fetch failures", () => {
      // The network code lives on `cause`, not on the TypeError itself.
      const fetchErr = Object.assign(new TypeError("fetch failed"), {
        cause: { code: "UND_ERR_CONNECT_TIMEOUT", syscall: "connect" },
      });

      emitRejection(fetchErr);

      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalledWith(
        "[moltbot] Non-fatal unhandled rejection (continuing):",
        expect.stringContaining("fetch failed"),
      );
    });

    it("does NOT exit on DNS resolution failures", () => {
      const dnsErr = Object.assign(new Error("DNS resolve failed"), {
        code: "UND_ERR_DNS_RESOLVE_FAILED",
      });

      emitRejection(dnsErr);

      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });

    it("does NOT exit on generic errors without code", () => {
      // NOTE(review): this relies on the handler's fall-through branch for errors that
      // are neither fatal, config, nor transient-network. Confirm that branch warns and
      // continues rather than exiting.
      const genericErr = new Error("Something went wrong");

      emitRejection(genericErr);

      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });

    it("does NOT exit on connection reset errors", () => {
      const connResetErr = Object.assign(new Error("Connection reset"), {
        code: "ECONNRESET",
      });

      emitRejection(connResetErr);

      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });

    it("does NOT exit on timeout errors", () => {
      const timeoutErr = Object.assign(new Error("Timeout"), {
        code: "ETIMEDOUT",
      });

      emitRejection(timeoutErr);

      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });
  });
});
|
||||||
@@ -1,11 +1,56 @@
|
|||||||
import process from "node:process";
|
import process from "node:process";
|
||||||
|
|
||||||
import { formatUncaughtError } from "./errors.js";
|
import { extractErrorCode, formatUncaughtError } from "./errors.js";
|
||||||
|
|
||||||
type UnhandledRejectionHandler = (reason: unknown) => boolean;
|
type UnhandledRejectionHandler = (reason: unknown) => boolean;
|
||||||
|
|
||||||
const handlers = new Set<UnhandledRejectionHandler>();
|
const handlers = new Set<UnhandledRejectionHandler>();
|
||||||
|
|
||||||
|
/**
 * Error codes classified as fatal.
 * isFatalError() reports true when a rejection reason (or its `cause`) carries one of these.
 */
const FATAL_ERROR_CODES = new Set<string>([
  // Memory exhaustion and script timeout
  "ERR_OUT_OF_MEMORY",
  "ERR_SCRIPT_EXECUTION_TIMEOUT",
  // Worker failures
  "ERR_WORKER_OUT_OF_MEMORY",
  "ERR_WORKER_UNCAUGHT_EXCEPTION",
  "ERR_WORKER_INITIALIZATION_FAILED",
]);
|
||||||
|
|
||||||
|
/**
 * Error codes classified as configuration problems.
 * isConfigError() reports true when a rejection reason (or its `cause`) carries one of these.
 */
const CONFIG_ERROR_CODES = new Set<string>([
  "INVALID_CONFIG",
  "MISSING_API_KEY",
  "MISSING_CREDENTIALS",
]);
|
||||||
|
|
||||||
|
/**
 * Network error codes that indicate transient failures.
 * A rejection carrying one of these should not crash the gateway.
 */
const TRANSIENT_NETWORK_CODES = new Set<string>([
  // Errno-style connection, DNS, and timeout codes
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNABORTED",
  "EPIPE",
  "EHOSTUNREACH",
  "ENETUNREACH",
  "EAI_AGAIN",
  // UND_ERR_* codes
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_DNS_RESOLVE_FAILED",
  "UND_ERR_CONNECT",
  "UND_ERR_SOCKET",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
]);
|
||||||
|
|
||||||
|
/**
 * Reads the `cause` property from an error-like value.
 *
 * @param err - Any thrown or rejected value.
 * @returns The value stored under `cause`, or `undefined` when `err` is null,
 *          otherwise falsy, or not of type "object".
 */
function getErrorCause(err: unknown): unknown {
  const isObjectLike = typeof err === "object" && err !== null;
  return isObjectLike ? (err as { cause?: unknown }).cause : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Resolves an error code from `err`, falling back to its immediate `cause`.
 *
 * The code found on `err` itself wins when it is truthy. Otherwise the lookup is
 * repeated on `err.cause`, one level only.
 *
 * @param err - Any thrown or rejected value.
 * @returns The first truthy code found, or the result of the `cause` lookup.
 */
function extractErrorCodeWithCause(err: unknown): string | undefined {
  return extractErrorCode(err) || extractErrorCode(getErrorCause(err));
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if an error is an AbortError.
|
* Checks if an error is an AbortError.
|
||||||
* These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
|
* These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
|
||||||
@@ -20,33 +65,14 @@ export function isAbortError(err: unknown): boolean {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Network error codes that indicate transient failures (shouldn't crash the gateway)
|
function isFatalError(err: unknown): boolean {
|
||||||
const TRANSIENT_NETWORK_CODES = new Set([
|
const code = extractErrorCodeWithCause(err);
|
||||||
"ECONNRESET",
|
return code !== undefined && FATAL_ERROR_CODES.has(code);
|
||||||
"ECONNREFUSED",
|
|
||||||
"ENOTFOUND",
|
|
||||||
"ETIMEDOUT",
|
|
||||||
"ESOCKETTIMEDOUT",
|
|
||||||
"ECONNABORTED",
|
|
||||||
"EPIPE",
|
|
||||||
"EHOSTUNREACH",
|
|
||||||
"ENETUNREACH",
|
|
||||||
"EAI_AGAIN",
|
|
||||||
"UND_ERR_CONNECT_TIMEOUT",
|
|
||||||
"UND_ERR_SOCKET",
|
|
||||||
"UND_ERR_HEADERS_TIMEOUT",
|
|
||||||
"UND_ERR_BODY_TIMEOUT",
|
|
||||||
]);
|
|
||||||
|
|
||||||
function getErrorCode(err: unknown): string | undefined {
|
|
||||||
if (!err || typeof err !== "object") return undefined;
|
|
||||||
const code = (err as { code?: unknown }).code;
|
|
||||||
return typeof code === "string" ? code : undefined;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function getErrorCause(err: unknown): unknown {
|
function isConfigError(err: unknown): boolean {
|
||||||
if (!err || typeof err !== "object") return undefined;
|
const code = extractErrorCodeWithCause(err);
|
||||||
return (err as { cause?: unknown }).cause;
|
return code !== undefined && CONFIG_ERROR_CODES.has(code);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -56,16 +82,13 @@ function getErrorCause(err: unknown): unknown {
|
|||||||
export function isTransientNetworkError(err: unknown): boolean {
|
export function isTransientNetworkError(err: unknown): boolean {
|
||||||
if (!err) return false;
|
if (!err) return false;
|
||||||
|
|
||||||
// Check the error itself
|
const code = extractErrorCodeWithCause(err);
|
||||||
const code = getErrorCode(err);
|
|
||||||
if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;
|
if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;
|
||||||
|
|
||||||
// "fetch failed" TypeError from undici (Node's native fetch)
|
// "fetch failed" TypeError from undici (Node's native fetch)
|
||||||
if (err instanceof TypeError && err.message === "fetch failed") {
|
if (err instanceof TypeError && err.message === "fetch failed") {
|
||||||
const cause = getErrorCause(err);
|
const cause = getErrorCause(err);
|
||||||
// The cause often contains the actual network error
|
|
||||||
if (cause) return isTransientNetworkError(cause);
|
if (cause) return isTransientNetworkError(cause);
|
||||||
// Even without a cause, "fetch failed" is typically a network issue
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -115,10 +138,26 @@ export function installUnhandledRejectionHandler(): void {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
|
if (isFatalError(reason)) {
|
||||||
// These are temporary connectivity issues that will resolve on their own
|
console.error("[moltbot] FATAL unhandled rejection:", formatUncaughtError(reason));
|
||||||
|
process.exit(1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isConfigError(reason)) {
|
||||||
|
console.error(
|
||||||
|
"[moltbot] CONFIGURATION ERROR - requires fix:",
|
||||||
|
formatUncaughtError(reason),
|
||||||
|
);
|
||||||
|
process.exit(1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (isTransientNetworkError(reason)) {
|
if (isTransientNetworkError(reason)) {
|
||||||
console.error("[moltbot] Network error (non-fatal):", formatUncaughtError(reason));
|
console.warn(
|
||||||
|
"[moltbot] Non-fatal unhandled rejection (continuing):",
|
||||||
|
formatUncaughtError(reason),
|
||||||
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user