fix(gateway): harden channel health monitor recovery

This commit is contained in:
Peter Steinberger
2026-02-17 03:26:13 +01:00
parent 4aed4eedb7
commit 1f850374f6
7 changed files with 282 additions and 18 deletions

View File

@@ -1,13 +1,13 @@
import { resolveChannelDefaultAccountId } from "../channels/plugins/helpers.js";
import { type ChannelId, getChannelPlugin, listChannelPlugins } from "../channels/plugins/index.js";
import type { ChannelAccountSnapshot } from "../channels/plugins/types.js";
import type { OpenClawConfig } from "../config/config.js";
import type { createSubsystemLogger } from "../logging/subsystem.js";
import type { RuntimeEnv } from "../runtime.js";
import { resolveChannelDefaultAccountId } from "../channels/plugins/helpers.js";
import { type ChannelId, getChannelPlugin, listChannelPlugins } from "../channels/plugins/index.js";
import { type BackoffPolicy, computeBackoff, sleepWithAbort } from "../infra/backoff.js";
import { formatErrorMessage } from "../infra/errors.js";
import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
import type { createSubsystemLogger } from "../logging/subsystem.js";
import { DEFAULT_ACCOUNT_ID } from "../routing/session-key.js";
import type { RuntimeEnv } from "../runtime.js";
const CHANNEL_RESTART_POLICY: BackoffPolicy = {
initialMs: 5_000,
@@ -61,6 +61,11 @@ type ChannelManagerOptions = {
channelRuntimeEnvs: Record<ChannelId, RuntimeEnv>;
};
type StartChannelOptions = {
preserveRestartAttempts?: boolean;
preserveManualStop?: boolean;
};
export type ChannelManager = {
getRuntimeSnapshot: () => ChannelRuntimeSnapshot;
startChannels: () => Promise<void>;
@@ -110,12 +115,17 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
return next;
};
const startChannel = async (channelId: ChannelId, accountId?: string) => {
const startChannelInternal = async (
channelId: ChannelId,
accountId?: string,
opts: StartChannelOptions = {},
) => {
const plugin = getChannelPlugin(channelId);
const startAccount = plugin?.gateway?.startAccount;
if (!startAccount) {
return;
}
const { preserveRestartAttempts = false, preserveManualStop = false } = opts;
const cfg = loadConfig();
resetDirectoryCache({ channel: channelId, accountId });
const store = getStore(channelId);
@@ -136,6 +146,8 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
if (!enabled) {
setRuntime(channelId, id, {
accountId: id,
enabled: false,
configured: true,
running: false,
lastError: plugin.config.disabledReason?.(account, cfg) ?? "disabled",
});
@@ -149,6 +161,8 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
if (!configured) {
setRuntime(channelId, id, {
accountId: id,
enabled: true,
configured: false,
running: false,
lastError: plugin.config.unconfiguredReason?.(account, cfg) ?? "not configured",
});
@@ -156,17 +170,23 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
}
const rKey = restartKey(channelId, id);
manuallyStopped.delete(rKey);
if (!preserveManualStop) {
manuallyStopped.delete(rKey);
}
const abort = new AbortController();
store.aborts.set(id, abort);
restartAttempts.delete(rKey);
if (!preserveRestartAttempts) {
restartAttempts.delete(rKey);
}
setRuntime(channelId, id, {
accountId: id,
enabled: true,
configured: true,
running: true,
lastStartAt: Date.now(),
lastError: null,
reconnectAttempts: 0,
reconnectAttempts: preserveRestartAttempts ? (restartAttempts.get(rKey) ?? 0) : 0,
});
const log = channelLogs[channelId];
@@ -180,15 +200,13 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
getStatus: () => getRuntime(channelId, id),
setStatus: (next) => setRuntime(channelId, id, next),
});
const tracked = Promise.resolve(task)
const trackedPromise = Promise.resolve(task)
.catch((err) => {
const message = formatErrorMessage(err);
setRuntime(channelId, id, { accountId: id, lastError: message });
log.error?.(`[${id}] channel exited: ${message}`);
})
.finally(() => {
store.aborts.delete(id);
store.tasks.delete(id);
setRuntime(channelId, id, {
accountId: id,
running: false,
@@ -214,17 +232,41 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
reconnectAttempts: attempt,
});
try {
await sleepWithAbort(delayMs);
await startChannel(channelId, id);
await sleepWithAbort(delayMs, abort.signal);
if (manuallyStopped.has(rKey)) {
return;
}
if (store.tasks.get(id) === trackedPromise) {
store.tasks.delete(id);
}
if (store.aborts.get(id) === abort) {
store.aborts.delete(id);
}
await startChannelInternal(channelId, id, {
preserveRestartAttempts: true,
preserveManualStop: true,
});
} catch {
// abort or startup failure — next crash will retry
}
})
.finally(() => {
if (store.tasks.get(id) === trackedPromise) {
store.tasks.delete(id);
}
if (store.aborts.get(id) === abort) {
store.aborts.delete(id);
}
});
store.tasks.set(id, tracked);
store.tasks.set(id, trackedPromise);
}),
);
};
const startChannel = async (channelId: ChannelId, accountId?: string) => {
await startChannelInternal(channelId, accountId);
};
const stopChannel = async (channelId: ChannelId, accountId?: string) => {
const plugin = getChannelPlugin(channelId);
const store = getStore(channelId);
@@ -333,6 +375,8 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
const configured = described?.configured;
const current = store.runtimes.get(id) ?? cloneDefaultRuntime(plugin.id, id);
const next = { ...current, accountId: id };
next.enabled = enabled;
next.configured = typeof configured === "boolean" ? configured : (next.configured ?? true);
if (!next.running) {
if (!enabled) {
next.lastError ??= plugin.config.disabledReason?.(account, cfg) ?? "disabled";