fix(infra): prevent gateway crashes on transient network errors

This commit is contained in:
elliotsecops
2026-01-27 14:43:42 -04:00
committed by Shadow
parent 3bf768ab07
commit 3b879fe524
3 changed files with 233 additions and 33 deletions

View File

@@ -0,0 +1,159 @@
import { describe, it, expect, vi, beforeAll, afterAll, beforeEach, afterEach } from "vitest";
import process from "node:process";
import { installUnhandledRejectionHandler } from "./unhandled-rejections.js";
/**
 * Verifies how the process-level "unhandledRejection" handler classifies rejection reasons:
 * fatal and configuration error codes must call process.exit(1); transient network errors and
 * other non-fatal reasons must only be logged as warnings.
 *
 * The handler is installed once for the whole suite; process.exit and the console methods are
 * spied on per test so no test can actually terminate the runner or pollute its output.
 */
describe("installUnhandledRejectionHandler - fatal detection", () => {
  let exitCalls: Array<string | number | null> = [];
  let exitSpy: ReturnType<typeof vi.spyOn>;
  let consoleErrorSpy: ReturnType<typeof vi.spyOn>;
  let consoleWarnSpy: ReturnType<typeof vi.spyOn>;
  let originalExit: typeof process.exit;

  beforeAll(() => {
    originalExit = process.exit;
    installUnhandledRejectionHandler();
  });

  beforeEach(() => {
    exitCalls = [];
    exitSpy = vi
      .spyOn(process, "exit")
      .mockImplementation((code: string | number | null | undefined) => {
        if (code !== undefined && code !== null) {
          exitCalls.push(code);
        }
        // process.exit is typed as returning `never`; the mock must not really exit.
        return undefined as never;
      });
    consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
    consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
  });

  afterEach(() => {
    vi.clearAllMocks();
    // Restore every spy created in beforeEach so each test starts from the real functions.
    exitSpy.mockRestore();
    consoleErrorSpy.mockRestore();
    consoleWarnSpy.mockRestore();
  });

  afterAll(() => {
    // Safety net: guarantee the real process.exit is back even if a spy restore was skipped.
    process.exit = originalExit;
  });

  describe("fatal errors", () => {
    it("exits on ERR_OUT_OF_MEMORY", () => {
      const oomErr = Object.assign(new Error("Out of memory"), {
        code: "ERR_OUT_OF_MEMORY",
      });
      process.emit("unhandledRejection", oomErr, Promise.resolve());
      expect(exitCalls).toEqual([1]);
      // Prefix must match the one the handler actually logs.
      expect(consoleErrorSpy).toHaveBeenCalledWith(
        "[moltbot] FATAL unhandled rejection:",
        expect.stringContaining("Out of memory"),
      );
    });

    it("exits on ERR_SCRIPT_EXECUTION_TIMEOUT", () => {
      const timeoutErr = Object.assign(new Error("Script execution timeout"), {
        code: "ERR_SCRIPT_EXECUTION_TIMEOUT",
      });
      process.emit("unhandledRejection", timeoutErr, Promise.resolve());
      expect(exitCalls).toEqual([1]);
    });

    it("exits on ERR_WORKER_OUT_OF_MEMORY", () => {
      const workerOomErr = Object.assign(new Error("Worker out of memory"), {
        code: "ERR_WORKER_OUT_OF_MEMORY",
      });
      process.emit("unhandledRejection", workerOomErr, Promise.resolve());
      expect(exitCalls).toEqual([1]);
    });
  });

  describe("configuration errors", () => {
    it("exits on INVALID_CONFIG", () => {
      const configErr = Object.assign(new Error("Invalid config"), {
        code: "INVALID_CONFIG",
      });
      process.emit("unhandledRejection", configErr, Promise.resolve());
      expect(exitCalls).toEqual([1]);
      expect(consoleErrorSpy).toHaveBeenCalledWith(
        "[moltbot] CONFIGURATION ERROR - requires fix:",
        expect.stringContaining("Invalid config"),
      );
    });

    it("exits on MISSING_API_KEY", () => {
      const missingKeyErr = Object.assign(new Error("Missing API key"), {
        code: "MISSING_API_KEY",
      });
      process.emit("unhandledRejection", missingKeyErr, Promise.resolve());
      expect(exitCalls).toEqual([1]);
    });
  });

  describe("non-fatal errors", () => {
    it("does NOT exit on undici fetch failures", () => {
      // The network code lives on `cause`, not on the TypeError itself.
      const fetchErr = Object.assign(new TypeError("fetch failed"), {
        cause: { code: "UND_ERR_CONNECT_TIMEOUT", syscall: "connect" },
      });
      process.emit("unhandledRejection", fetchErr, Promise.resolve());
      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalledWith(
        "[moltbot] Non-fatal unhandled rejection (continuing):",
        expect.stringContaining("fetch failed"),
      );
    });

    it("does NOT exit on DNS resolution failures", () => {
      const dnsErr = Object.assign(new Error("DNS resolve failed"), {
        code: "UND_ERR_DNS_RESOLVE_FAILED",
      });
      process.emit("unhandledRejection", dnsErr, Promise.resolve());
      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });

    it("does NOT exit on generic errors without code", () => {
      const genericErr = new Error("Something went wrong");
      process.emit("unhandledRejection", genericErr, Promise.resolve());
      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });

    it("does NOT exit on connection reset errors", () => {
      const connResetErr = Object.assign(new Error("Connection reset"), {
        code: "ECONNRESET",
      });
      process.emit("unhandledRejection", connResetErr, Promise.resolve());
      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });

    it("does NOT exit on timeout errors", () => {
      const timeoutErr = Object.assign(new Error("Timeout"), {
        code: "ETIMEDOUT",
      });
      process.emit("unhandledRejection", timeoutErr, Promise.resolve());
      expect(exitCalls).toEqual([]);
      expect(consoleWarnSpy).toHaveBeenCalled();
    });
  });
});

View File

@@ -1,11 +1,56 @@
import process from "node:process";
import { formatUncaughtError } from "./errors.js";
import { extractErrorCode, formatUncaughtError } from "./errors.js";
// Shape of a registered rejection handler: it receives the raw rejection reason and returns a boolean.
// NOTE(review): the boolean presumably means "handled, skip default processing" — confirm against the dispatch code.
type UnhandledRejectionHandler = (reason: unknown) => boolean;
// Module-level registry of handlers; being a Set, a given function reference is stored at most once.
const handlers = new Set<UnhandledRejectionHandler>();
/**
 * Error codes classified as fatal: a rejection reason carrying one of these codes
 * is matched by `isFatalError`.
 */
const FATAL_ERROR_CODES = new Set<string>([
  // Memory exhaustion / script timeout
  "ERR_OUT_OF_MEMORY",
  "ERR_SCRIPT_EXECUTION_TIMEOUT",
  // Worker failures
  "ERR_WORKER_OUT_OF_MEMORY",
  "ERR_WORKER_UNCAUGHT_EXCEPTION",
  "ERR_WORKER_INITIALIZATION_FAILED",
]);
/**
 * Error codes classified as configuration problems: a rejection reason carrying one of
 * these codes is matched by `isConfigError`.
 */
const CONFIG_ERROR_CODES = new Set<string>([
  "INVALID_CONFIG",
  "MISSING_API_KEY",
  "MISSING_CREDENTIALS",
]);
/**
 * Codes that mark a failure as a transient network problem. Errors carrying one of
 * these must not bring the gateway down.
 */
const TRANSIENT_NETWORK_CODES = new Set<string>([
  // E*-style connection, DNS and timeout codes
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNABORTED",
  "EPIPE",
  "EHOSTUNREACH",
  "ENETUNREACH",
  "EAI_AGAIN",
  // UND_ERR_* codes, as carried by undici fetch failures
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_DNS_RESOLVE_FAILED",
  "UND_ERR_CONNECT",
  "UND_ERR_SOCKET",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
]);
/**
 * Reads the `cause` property off an arbitrary thrown value.
 *
 * @param err - Any value; only non-null values whose `typeof` is "object" are inspected.
 * @returns The value of `err.cause`, or `undefined` when `err` is not a non-null object
 *   (primitives, `null`, `undefined`, and functions all yield `undefined`).
 */
function getErrorCause(err: unknown): unknown {
  const isObjectLike = typeof err === "object" && err !== null;
  return isObjectLike ? (err as { cause?: unknown }).cause : undefined;
}
/**
 * Resolves an error code for `err`, looking at the error itself first and falling back
 * to its immediate `cause` (one level only) when the error carries no usable code.
 *
 * @param err - Any thrown or rejected value.
 * @returns The first truthy code found, otherwise whatever `extractErrorCode` yields for the cause.
 */
function extractErrorCodeWithCause(err: unknown): string | undefined {
  // `||` (not `??`) on purpose: any falsy direct result defers to the cause.
  return extractErrorCode(err) || extractErrorCode(getErrorCause(err));
}
/**
* Checks if an error is an AbortError.
* These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
@@ -20,33 +65,14 @@ export function isAbortError(err: unknown): boolean {
return false;
}
// Network error codes that indicate transient failures (shouldn't crash the gateway)
// NOTE(review): this listing also contains another TRANSIENT_NETWORK_CODES declaration that
// additionally includes "UND_ERR_DNS_RESOLVE_FAILED" and "UND_ERR_CONNECT". This copy looks
// like the pre-change side of the diff hunk — confirm only one declaration survives.
const TRANSIENT_NETWORK_CODES = new Set([
"ECONNRESET",
"ECONNREFUSED",
"ENOTFOUND",
"ETIMEDOUT",
"ESOCKETTIMEDOUT",
"ECONNABORTED",
"EPIPE",
"EHOSTUNREACH",
"ENETUNREACH",
"EAI_AGAIN",
"UND_ERR_CONNECT_TIMEOUT",
"UND_ERR_SOCKET",
"UND_ERR_HEADERS_TIMEOUT",
"UND_ERR_BODY_TIMEOUT",
]);
function getErrorCode(err: unknown): string | undefined {
if (!err || typeof err !== "object") return undefined;
const code = (err as { code?: unknown }).code;
return typeof code === "string" ? code : undefined;
/**
 * Tells whether `err` (or its immediate cause) carries a code listed in FATAL_ERROR_CODES.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` only when a code is found and it is one of the fatal codes.
 */
function isFatalError(err: unknown): boolean {
  const errorCode = extractErrorCodeWithCause(err);
  if (errorCode === undefined) return false;
  return FATAL_ERROR_CODES.has(errorCode);
}
function getErrorCause(err: unknown): unknown {
if (!err || typeof err !== "object") return undefined;
return (err as { cause?: unknown }).cause;
/**
 * Tells whether `err` (or its immediate cause) carries a code listed in CONFIG_ERROR_CODES.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` only when a code is found and it is one of the configuration error codes.
 */
function isConfigError(err: unknown): boolean {
  const errorCode = extractErrorCodeWithCause(err);
  if (errorCode === undefined) return false;
  return CONFIG_ERROR_CODES.has(errorCode);
}
/**
@@ -56,16 +82,13 @@ function getErrorCause(err: unknown): unknown {
export function isTransientNetworkError(err: unknown): boolean {
if (!err) return false;
// Check the error itself
const code = getErrorCode(err);
const code = extractErrorCodeWithCause(err);
if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;
// "fetch failed" TypeError from undici (Node's native fetch)
if (err instanceof TypeError && err.message === "fetch failed") {
const cause = getErrorCause(err);
// The cause often contains the actual network error
if (cause) return isTransientNetworkError(cause);
// Even without a cause, "fetch failed" is typically a network issue
return true;
}
@@ -115,10 +138,26 @@ export function installUnhandledRejectionHandler(): void {
return;
}
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
// These are temporary connectivity issues that will resolve on their own
if (isFatalError(reason)) {
console.error("[moltbot] FATAL unhandled rejection:", formatUncaughtError(reason));
process.exit(1);
return;
}
if (isConfigError(reason)) {
console.error(
"[moltbot] CONFIGURATION ERROR - requires fix:",
formatUncaughtError(reason),
);
process.exit(1);
return;
}
if (isTransientNetworkError(reason)) {
console.error("[moltbot] Network error (non-fatal):", formatUncaughtError(reason));
console.warn(
"[moltbot] Non-fatal unhandled rejection (continuing):",
formatUncaughtError(reason),
);
return;
}