fix(gateway): gracefully handle AbortError and transient network errors (#2451)

* fix(tts): generate audio when block streaming drops final reply

When block streaming succeeds, final replies are dropped but TTS was only
applied to final replies. Fix by accumulating block text during streaming
and generating TTS-only audio after streaming completes.

Also:
- Change truncate vs skip behavior when summary OFF (now truncates)
- Align TTS limits with Telegram max (4096 chars)
- Improve /tts command help messages with examples
- Add newline separator between accumulated blocks

* fix(tts): add error handling for accumulated block TTS

* feat(tts): add descriptive inline menu with action descriptions

- Add value/label support for command arg choices
- TTS menu now shows descriptive title listing each action
- Capitalize button labels (On, Off, Status, etc.)
- Update Telegram, Discord, and Slack handlers to use labels

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(gateway): gracefully handle AbortError and transient network errors

Addresses issues #1851, #1997, and #2034.

During config reload (SIGUSR1), in-flight requests are aborted, causing
AbortError exceptions. Similarly, transient network errors (fetch failed,
ECONNRESET, ETIMEDOUT, etc.) can crash the gateway unnecessarily.

This change:
- Adds isAbortError() to detect intentional cancellations
- Adds isTransientNetworkError() to detect temporary connectivity issues
- Logs these errors appropriately instead of crashing
- Handles nested cause chains and AggregateError

AbortError is logged as a warning (expected during shutdown).
Network errors are logged as non-fatal errors (will resolve on their own).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(test): update commands-registry test expectations

Update test expectations to match new ResolvedCommandArgChoice format
(choices now return {label, value} objects instead of plain strings).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: harden unhandled rejection handling and tts menus (#2451) (thanks @Glucksberg)

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Shadow <hi@shadowing.dev>
This commit is contained in:
Glucksberg
2026-01-26 21:51:53 -04:00
committed by GitHub
parent d8e5dd91ba
commit 481bd333eb
14 changed files with 487 additions and 159 deletions

View File

@@ -1,11 +1,88 @@
import process from "node:process";
import { formatErrorMessage, formatUncaughtError } from "./errors.js";
import { formatUncaughtError } from "./errors.js";
type UnhandledRejectionHandler = (reason: unknown) => boolean;
const handlers = new Set<UnhandledRejectionHandler>();
/**
* Checks if an error is an AbortError.
* These are typically intentional cancellations (e.g., during shutdown) and shouldn't crash.
*/
export function isAbortError(err: unknown): boolean {
if (!err || typeof err !== "object") return false;
const name = "name" in err ? String(err.name) : "";
if (name === "AbortError") return true;
// Check for "This operation was aborted" message from Node's undici
const message = "message" in err && typeof err.message === "string" ? err.message : "";
if (message === "This operation was aborted") return true;
return false;
}
// Network error codes that indicate transient failures (shouldn't crash the gateway)
const TRANSIENT_NETWORK_CODES = new Set([
"ECONNRESET",
"ECONNREFUSED",
"ENOTFOUND",
"ETIMEDOUT",
"ESOCKETTIMEDOUT",
"ECONNABORTED",
"EPIPE",
"EHOSTUNREACH",
"ENETUNREACH",
"EAI_AGAIN",
"UND_ERR_CONNECT_TIMEOUT",
"UND_ERR_SOCKET",
"UND_ERR_HEADERS_TIMEOUT",
"UND_ERR_BODY_TIMEOUT",
]);
function getErrorCode(err: unknown): string | undefined {
if (!err || typeof err !== "object") return undefined;
const code = (err as { code?: unknown }).code;
return typeof code === "string" ? code : undefined;
}
function getErrorCause(err: unknown): unknown {
if (!err || typeof err !== "object") return undefined;
return (err as { cause?: unknown }).cause;
}
/**
* Checks if an error is a transient network error that shouldn't crash the gateway.
* These are typically temporary connectivity issues that will resolve on their own.
*/
export function isTransientNetworkError(err: unknown): boolean {
if (!err) return false;
// Check the error itself
const code = getErrorCode(err);
if (code && TRANSIENT_NETWORK_CODES.has(code)) return true;
// "fetch failed" TypeError from undici (Node's native fetch)
if (err instanceof TypeError && err.message === "fetch failed") {
const cause = getErrorCause(err);
// The cause often contains the actual network error
if (cause) return isTransientNetworkError(cause);
// Even without a cause, "fetch failed" is typically a network issue
return true;
}
// Check the cause chain recursively
const cause = getErrorCause(err);
if (cause && cause !== err) {
return isTransientNetworkError(cause);
}
// AggregateError may wrap multiple causes
if (err instanceof AggregateError && err.errors?.length) {
return err.errors.some((e) => isTransientNetworkError(e));
}
return false;
}
export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHandler): () => void {
handlers.add(handler);
return () => {
@@ -13,36 +90,6 @@ export function registerUnhandledRejectionHandler(handler: UnhandledRejectionHan
};
}
/**
* Check if an error is a recoverable/transient error that shouldn't crash the process.
* These include network errors and abort signals during shutdown.
*/
function isRecoverableError(reason: unknown): boolean {
if (!reason) return false;
// Check error name for AbortError
if (reason instanceof Error && reason.name === "AbortError") {
return true;
}
const message = reason instanceof Error ? reason.message : formatErrorMessage(reason);
const lowerMessage = message.toLowerCase();
return (
lowerMessage.includes("fetch failed") ||
lowerMessage.includes("network request") ||
lowerMessage.includes("econnrefused") ||
lowerMessage.includes("econnreset") ||
lowerMessage.includes("etimedout") ||
lowerMessage.includes("socket hang up") ||
lowerMessage.includes("enotfound") ||
lowerMessage.includes("network error") ||
lowerMessage.includes("getaddrinfo") ||
lowerMessage.includes("client network socket disconnected") ||
lowerMessage.includes("this operation was aborted") ||
lowerMessage.includes("aborted")
);
}
export function isUnhandledRejectionHandled(reason: unknown): boolean {
for (const handler of handlers) {
try {
@@ -61,9 +108,17 @@ export function installUnhandledRejectionHandler(): void {
process.on("unhandledRejection", (reason, _promise) => {
if (isUnhandledRejectionHandled(reason)) return;
// Don't crash on recoverable/transient errors - log them and continue
if (isRecoverableError(reason)) {
console.error("[clawdbot] Recoverable error (not crashing):", formatUncaughtError(reason));
// AbortError is typically an intentional cancellation (e.g., during shutdown)
// Log it but don't crash - these are expected during graceful shutdown
if (isAbortError(reason)) {
console.warn("[clawdbot] Suppressed AbortError:", formatUncaughtError(reason));
return;
}
// Transient network errors (fetch failed, connection reset, etc.) shouldn't crash
// These are temporary connectivity issues that will resolve on their own
if (isTransientNetworkError(reason)) {
console.error("[clawdbot] Network error (non-fatal):", formatUncaughtError(reason));
return;
}