fix(gateway): gracefully handle AbortError and transient network errors (#2451)

* fix(tts): generate audio when block streaming drops final reply

When block streaming succeeds, final replies are dropped but TTS was only
applied to final replies. Fix by accumulating block text during streaming
and generating TTS-only audio after streaming completes.

Also:
- Change truncate vs skip behavior when summary OFF (now truncates)
- Align TTS limits with Telegram max (4096 chars)
- Improve /tts command help messages with examples
- Add newline separator between accumulated blocks

* fix(tts): add error handling for accumulated block TTS

* feat(tts): add descriptive inline menu with action descriptions

- Add value/label support for command arg choices
- TTS menu now shows descriptive title listing each action
- Capitalize button labels (On, Off, Status, etc.)
- Update Telegram, Discord, and Slack handlers to use labels

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(gateway): gracefully handle AbortError and transient network errors

Addresses issues #1851, #1997, and #2034.

During config reload (SIGUSR1), in-flight requests are aborted, causing
AbortError exceptions. Similarly, transient network errors (fetch failed,
ECONNRESET, ETIMEDOUT, etc.) can crash the gateway unnecessarily.

This change:
- Adds isAbortError() to detect intentional cancellations
- Adds isTransientNetworkError() to detect temporary connectivity issues
- Logs these errors appropriately instead of crashing
- Handles nested cause chains and AggregateError

AbortError is logged as a warning (expected during shutdown).
Network errors are logged as non-fatal errors (will resolve on their own).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(test): update commands-registry test expectations

Update test expectations to match new ResolvedCommandArgChoice format
(choices now return {label, value} objects instead of plain strings).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: harden unhandled rejection handling and tts menus (#2451) (thanks @Glucksberg)

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Shadow <hi@shadowing.dev>
This commit is contained in:
Glucksberg
2026-01-26 21:51:53 -04:00
committed by GitHub
parent d8e5dd91ba
commit 481bd333eb
14 changed files with 487 additions and 159 deletions

View File

@@ -0,0 +1,129 @@
import { describe, expect, it } from "vitest";
import { isAbortError, isTransientNetworkError } from "./unhandled-rejections.js";
// Spec for isAbortError: matches on the exact name "AbortError" or the exact
// message "This operation was aborted"; everything else is rejected.
describe("isAbortError", () => {
  it("returns true for error with name AbortError", () => {
    const renamed = Object.assign(new Error("aborted"), { name: "AbortError" });
    expect(isAbortError(renamed)).toBe(true);
  });
  it('returns true for error with "This operation was aborted" message', () => {
    expect(isAbortError(new Error("This operation was aborted"))).toBe(true);
  });
  it("returns true for undici-style AbortError", () => {
    // Both the name and the exact message are set here.
    const undiciStyle = new Error("This operation was aborted");
    undiciStyle.name = "AbortError";
    expect(isAbortError(undiciStyle)).toBe(true);
  });
  it("returns true for object with AbortError name", () => {
    const plainObject = { name: "AbortError", message: "test" };
    expect(isAbortError(plainObject)).toBe(true);
  });
  it("returns false for regular errors", () => {
    const ordinary = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const candidate of ordinary) {
      expect(isAbortError(candidate)).toBe(false);
    }
  });
  it("returns false for errors with similar but different messages", () => {
    // Only the exact message matches; near misses must be rejected.
    const nearMisses = ["Operation aborted", "aborted", "Request was aborted"];
    for (const text of nearMisses) {
      expect(isAbortError(new Error(text))).toBe(false);
    }
  });
  it("returns false for null and undefined", () => {
    for (const empty of [null, undefined]) {
      expect(isAbortError(empty)).toBe(false);
    }
  });
  it("returns false for non-error values", () => {
    for (const primitive of ["string error", 42]) {
      expect(isAbortError(primitive)).toBe(false);
    }
  });
  it("returns false for plain objects without AbortError name", () => {
    const unnamed = { message: "plain object" };
    expect(isAbortError(unnamed)).toBe(false);
  });
});
// Spec for isTransientNetworkError: known network error codes, undici's
// "fetch failed" TypeError, nested `cause` chains and AggregateError members.
describe("isTransientNetworkError", () => {
  // Builds an Error carrying the given `code` property.
  const errorWithCode = (message: string, code: string) =>
    Object.assign(new Error(message), { code });

  it("returns true for errors with transient network codes", () => {
    const transientCodes = [
      "ECONNRESET",
      "ECONNREFUSED",
      "ENOTFOUND",
      "ETIMEDOUT",
      "ESOCKETTIMEDOUT",
      "ECONNABORTED",
      "EPIPE",
      "EHOSTUNREACH",
      "ENETUNREACH",
      "EAI_AGAIN",
      "UND_ERR_CONNECT_TIMEOUT",
      "UND_ERR_SOCKET",
      "UND_ERR_HEADERS_TIMEOUT",
      "UND_ERR_BODY_TIMEOUT",
    ];
    transientCodes.forEach((code) => {
      expect(isTransientNetworkError(errorWithCode("test", code)), `code: ${code}`).toBe(true);
    });
  });
  it('returns true for TypeError with "fetch failed" message', () => {
    expect(isTransientNetworkError(new TypeError("fetch failed"))).toBe(true);
  });
  it("returns true for fetch failed with network cause", () => {
    const dnsFailure = errorWithCode("getaddrinfo ENOTFOUND", "ENOTFOUND");
    const fetchFailure = Object.assign(new TypeError("fetch failed"), { cause: dnsFailure });
    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });
  it("returns true for nested cause chain with network error", () => {
    // fetch failed -> wrapper -> ECONNRESET
    const reset = errorWithCode("connection reset", "ECONNRESET");
    const wrapper = Object.assign(new Error("wrapper"), { cause: reset });
    const fetchFailure = Object.assign(new TypeError("fetch failed"), { cause: wrapper });
    expect(isTransientNetworkError(fetchFailure)).toBe(true);
  });
  it("returns true for AggregateError containing network errors", () => {
    const timedOut = errorWithCode("timeout", "ETIMEDOUT");
    const aggregate = new AggregateError([timedOut], "Multiple errors");
    expect(isTransientNetworkError(aggregate)).toBe(true);
  });
  it("returns false for regular errors without network codes", () => {
    const ordinary = [
      new Error("Something went wrong"),
      new TypeError("Cannot read property"),
      new RangeError("Invalid array length"),
    ];
    for (const candidate of ordinary) {
      expect(isTransientNetworkError(candidate)).toBe(false);
    }
  });
  it("returns false for errors with non-network codes", () => {
    expect(isTransientNetworkError(errorWithCode("test", "INVALID_CONFIG"))).toBe(false);
  });
  it("returns false for null and undefined", () => {
    for (const empty of [null, undefined]) {
      expect(isTransientNetworkError(empty)).toBe(false);
    }
  });
  it("returns false for non-error values", () => {
    for (const value of ["string error", 42, { message: "plain object" }]) {
      expect(isTransientNetworkError(value)).toBe(false);
    }
  });
  it("returns false for AggregateError with only non-network errors", () => {
    const aggregate = new AggregateError([new Error("regular error")], "Multiple errors");
    expect(isTransientNetworkError(aggregate)).toBe(false);
  });
});

View File

@@ -1,11 +1,88 @@
import process from "node:process";
import { formatErrorMessage, formatUncaughtError } from "./errors.js";
import { formatUncaughtError } from "./errors.js";
/**
 * Signature of the callbacks stored in `handlers`: receives the rejection
 * reason and returns a boolean.
 * NOTE(review): the boolean presumably means "this handler dealt with the
 * reason" — confirm against isUnhandledRejectionHandled.
 */
type UnhandledRejectionHandler = (reason: unknown) => boolean;
// Module-level registry of handlers; registerUnhandledRejectionHandler adds to it
// and isUnhandledRejectionHandled iterates over it.
const handlers = new Set<UnhandledRejectionHandler>();
/**
 * Reports whether a value looks like an abort/cancellation error.
 *
 * A value qualifies when it is a non-null object and either its `name`
 * stringifies to "AbortError" or its `message` is exactly
 * "This operation was aborted" (the text used by Node's undici).
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` for abort-style errors, `false` for everything else
 *   (including primitives, `null` and `undefined`).
 */
export function isAbortError(err: unknown): boolean {
  if (err === null || typeof err !== "object") {
    return false;
  }
  // The name is checked first; the message is only read when the name does not match.
  if ("name" in err && String(err.name) === "AbortError") {
    return true;
  }
  // Strict equality against a string literal can only succeed for string messages.
  return "message" in err && err.message === "This operation was aborted";
}
// Network error codes that indicate transient failures (shouldn't crash the gateway)
const TRANSIENT_NETWORK_CODES: ReadonlySet<string> = new Set([
  "ECONNRESET",
  "ECONNREFUSED",
  "ENOTFOUND",
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNABORTED",
  "EPIPE",
  "EHOSTUNREACH",
  "ENETUNREACH",
  "EAI_AGAIN",
  "UND_ERR_CONNECT_TIMEOUT",
  "UND_ERR_SOCKET",
  "UND_ERR_HEADERS_TIMEOUT",
  "UND_ERR_BODY_TIMEOUT",
]);

/** Returns `err.code` when `err` is an object whose `code` is a string; otherwise `undefined`. */
function getErrorCode(err: unknown): string | undefined {
  if (!err || typeof err !== "object") return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" ? code : undefined;
}

/** Returns `err.cause` when `err` is an object; otherwise `undefined`. */
function getErrorCause(err: unknown): unknown {
  if (!err || typeof err !== "object") return undefined;
  return (err as { cause?: unknown }).cause;
}

/**
 * Recursive worker behind isTransientNetworkError.
 *
 * `visited` holds every object already examined during this traversal. An
 * object seen a second time contributes nothing (returns `false`), which
 * keeps cyclic `cause` / `errors` graphs from recursing without bound.
 */
function containsTransientNetworkError(err: unknown, visited: Set<unknown>): boolean {
  if (!err) return false;
  if (typeof err === "object") {
    if (visited.has(err)) return false;
    visited.add(err);
  }

  // Check the error itself
  const code = getErrorCode(err);
  if (code !== undefined && TRANSIENT_NETWORK_CODES.has(code)) return true;

  const cause = getErrorCause(err);

  // "fetch failed" TypeError from undici (Node's native fetch)
  if (err instanceof TypeError && err.message === "fetch failed") {
    // When a cause is attached it decides the verdict; without one,
    // "fetch failed" is itself treated as a network issue.
    return cause ? containsTransientNetworkError(cause, visited) : true;
  }

  // Walk the cause chain, but do not stop at a non-transient cause: an
  // AggregateError may still carry a transient error in `errors`.
  if (cause && containsTransientNetworkError(cause, visited)) return true;

  // AggregateError may wrap multiple causes
  if (err instanceof AggregateError && err.errors?.length) {
    return err.errors.some((inner: unknown) => containsTransientNetworkError(inner, visited));
  }
  return false;
}

/**
 * Checks if an error is a transient network error that shouldn't crash the gateway.
 * These are typically temporary connectivity issues that will resolve on their own.
 *
 * An error counts as transient when:
 * - its string `code` is listed in TRANSIENT_NETWORK_CODES, or
 * - it is a `TypeError` with message "fetch failed" and either has no `cause`
 *   or its `cause` is transient, or
 * - anything reachable through its `cause` chain is transient, or
 * - it is an `AggregateError` and any member of `errors` is transient.
 *
 * Cyclic `cause` references are tolerated: each object is inspected at most once.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` when a transient network failure is found, otherwise `false`
 *   (including for `null`, `undefined` and primitives).
 */
export function isTransientNetworkError(err: unknown): boolean {
  return containsTransientNetworkError(err, new Set<unknown>());
}
export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
handlers.add(handler);
return () => {
@@ -13,36 +90,6 @@ export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHan
};
}
/**
 * Decides whether a rejection reason is recoverable, i.e. should be logged
 * rather than allowed to crash the process.
 *
 * A reason is recoverable when it is an `Error` named "AbortError", or when its
 * lower-cased message contains one of the network/abort marker substrings below.
 * For non-`Error` reasons the message is obtained via `formatErrorMessage`.
 *
 * @param reason - The rejection reason.
 * @returns `true` when the reason matches; `false` otherwise, including for
 *   any falsy reason.
 */
function isRecoverableError(reason: unknown): boolean {
  if (!reason) return false;

  let text: string;
  if (reason instanceof Error) {
    // Check error name for AbortError
    if (reason.name === "AbortError") return true;
    text = reason.message;
  } else {
    text = formatErrorMessage(reason);
  }

  // Matching is case-insensitive: the message is lower-cased once and the
  // markers are already lower-case.
  const haystack = text.toLowerCase();
  const markers = [
    "fetch failed",
    "network request",
    "econnrefused",
    "econnreset",
    "etimedout",
    "socket hang up",
    "enotfound",
    "network error",
    "getaddrinfo",
    "client network socket disconnected",
    "this operation was aborted",
    "aborted",
  ];
  return markers.some((marker) => haystack.includes(marker));
}
export function isUnhandledRejectionHandled(reason: unknown): boolean {
for (const handler of handlers) {
try {
@@ -61,9 +108,17 @@ export function installUnhandledRejectionHandler(): void {
process.on("unhandledRejection", (reason, _promise) => {
if (isUnhandledRejectionHandled(reason)) return;
// Don't crash on recoverable/transient errors - log them and continue
if (isRecoverableError(reason)) {
console.error("[clawdbot] Recoverable error (not crashing):", formatUncaughtError(reason));
// AbortError is typically an intentional cancellation (e.g., during shutdown)
// Log it but don't crash - these are expected during graceful shutdown
if (isAbortError(reason)) {
console.warn("[clawdbot] Suppressed AbortError:", formatUncaughtError(reason));
return;
}
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
// These are temporary connectivity issues that will resolve on their own
if (isTransientNetworkError(reason)) {
console.error("[clawdbot] Network error (non-fatal):", formatUncaughtError(reason));
return;
}