fix: kill stuck ACP child processes on startup and harden sessions in discord threads (#33699)

* Gateway: resolve agent.wait for chat.send runs

* Discord: harden ACP thread binding + listener timeout

* ACPX: handle already-exited child wait

* Gateway/Discord: address PR review findings

* Discord: keep ACP error-state thread bindings on startup

* gateway: make agent.wait dedupe bridge event-driven

* discord: harden ACP probe classification and cap startup fan-out

* discord: add cooperative timeout cancellation

* discord: fix startup probe concurrency helper typing

* plugin-sdk: avoid Windows root-alias shard timeout

* plugin-sdk: keep root alias reflection path non-blocking

* discord+gateway: resolve remaining PR review findings

* gateway+discord: fix codex review regressions

* Discord/Gateway: address Codex review findings

* Gateway: keep agent.wait lifecycle active with shared run IDs

* Discord: clean up status reactions on aborted runs

* fix: add changelog note for ACP/Discord startup hardening (#33699) (thanks @dutifulbob)

---------

Co-authored-by: Onur <2453968+osolmaz@users.noreply.github.com>
This commit is contained in:
Bob
2026-03-04 10:52:28 +01:00
committed by GitHub
parent bd25182d5a
commit 61f7cea48b
30 changed files with 2568 additions and 180 deletions

View File

@@ -10,6 +10,8 @@ import {
import { GatewayCloseCodes, type GatewayPlugin } from "@buape/carbon/gateway";
import { VoicePlugin } from "@buape/carbon/voice";
import { Routes } from "discord-api-types/v10";
import { getAcpSessionManager } from "../../acp/control-plane/manager.js";
import { isAcpRuntimeError } from "../../acp/runtime/errors.js";
import { resolveTextChunkLimit } from "../../auto-reply/chunk.js";
import type { NativeCommandSpec } from "../../auto-reply/commands-registry.js";
import { listNativeCommandSpecsForConfig } from "../../auto-reply/commands-registry.js";
@@ -175,6 +177,92 @@ function appendPluginCommandSpecs(params: {
return merged;
}
/** Longest time, in milliseconds, a single ACP session status probe is awaited before it counts as a timeout. */
const DISCORD_ACP_STATUS_PROBE_TIMEOUT_MS = 8 * 1000;
/** Inactivity span, in milliseconds, after which a session stored as "running" is considered stale. */
const DISCORD_ACP_STALE_RUNNING_ACTIVITY_MS = 120_000;
/**
 * Tells whether an error message contains one of the two known
 * "missing ACP session" phrasings.
 *
 * Matching is a case-sensitive substring check.
 *
 * @param message - Error message text to inspect.
 * @returns `true` when either phrase occurs anywhere in `message`.
 */
function isLegacyMissingSessionError(message: string): boolean {
  const missingSessionPhrases = ["Session is not ACP-enabled", "ACP session metadata missing"];
  return missingSessionPhrases.some((phrase) => message.includes(phrase));
}
/**
 * Maps a failed ACP status probe onto a binding-health verdict.
 *
 * Checks are applied in this order:
 * 1. An ACP runtime error with code `ACP_SESSION_INIT_FAILED` is stale.
 * 2. A message recognised by `isLegacyMissingSessionError` is stale.
 * 3. Any other error is stale only when the caller already judged the
 *    session to be a stale "running" one; otherwise it is uncertain.
 *
 * @param params.error - The value the status probe rejected with.
 * @param params.isStaleRunning - Whether the stored session looked like a stale running session.
 * @returns The verdict plus a short machine-readable reason.
 */
function classifyAcpStatusProbeError(params: { error: unknown; isStaleRunning: boolean }): {
  status: "stale" | "uncertain";
  reason: string;
} {
  const { error, isStaleRunning } = params;

  if (isAcpRuntimeError(error) && error.code === "ACP_SESSION_INIT_FAILED") {
    return { status: "stale", reason: "session-init-failed" };
  }

  // Non-Error rejections are stringified so the phrase check still applies.
  const errorText = error instanceof Error ? error.message : String(error);
  if (isLegacyMissingSessionError(errorText)) {
    return { status: "stale", reason: "session-missing" };
  }

  if (isStaleRunning) {
    return { status: "stale", reason: "status-error-running-stale" };
  }
  return { status: "uncertain", reason: "status-error" };
}
/**
 * Probes the ACP session behind a Discord thread binding and reports whether
 * the binding looks healthy, stale, or cannot be decided.
 *
 * The status lookup is raced against a `DISCORD_ACP_STATUS_PROBE_TIMEOUT_MS`
 * timer. If the timer wins, the lookup is signalled to abort. A timeout or
 * lookup error is only treated as stale when the stored session was
 * "running" with no activity for at least
 * `DISCORD_ACP_STALE_RUNNING_ACTIVITY_MS` (lookup errors are further refined
 * by `classifyAcpStatusProbeError`).
 *
 * @param params.cfg - Configuration forwarded to the session manager.
 * @param params.sessionKey - Key of the session to probe.
 * @param params.storedState - Last persisted session state, if any.
 * @param params.lastActivityAt - Last persisted activity timestamp (ms), if any.
 * @returns The health verdict; `reason` is set for every non-healthy verdict.
 */
async function probeDiscordAcpBindingHealth(params: {
  cfg: OpenClawConfig;
  sessionKey: string;
  storedState?: "idle" | "running" | "error";
  lastActivityAt?: number;
}): Promise<{ status: "healthy" | "stale" | "uncertain"; reason?: string }> {
  const sessionManager = getAcpSessionManager();
  const probeAbort = new AbortController();

  // Fold success and failure into tagged values so the race below never rejects.
  const lookupOutcome = sessionManager
    .getSessionStatus({
      cfg: params.cfg,
      sessionKey: params.sessionKey,
      signal: probeAbort.signal,
    })
    .then((status) => ({ kind: "status" as const, status }))
    .catch((error: unknown) => ({ kind: "error" as const, error }));

  let deadlineTimer: ReturnType<typeof setTimeout> | null = null;
  const deadlineOutcome = new Promise<{ kind: "timeout" }>((resolve) => {
    deadlineTimer = setTimeout(() => {
      resolve({ kind: "timeout" });
    }, DISCORD_ACP_STATUS_PROBE_TIMEOUT_MS);
    // Where the runtime supports it, do not let the timer keep the process alive.
    deadlineTimer.unref?.();
  });

  const outcome = await Promise.race([lookupOutcome, deadlineOutcome]);
  if (deadlineTimer !== null) {
    clearTimeout(deadlineTimer);
  }
  const timedOut = outcome.kind === "timeout";
  if (timedOut) {
    // Cooperative cancellation of the lookup that is still in flight.
    probeAbort.abort();
  }

  // Time since last recorded activity; only meaningful for "running" sessions
  // that carry a finite timestamp, otherwise it stays at zero.
  const storedAsRunning = params.storedState === "running";
  let runningForMs = 0;
  if (storedAsRunning && Number.isFinite(params.lastActivityAt)) {
    const lastActivityMs = Math.max(0, Math.floor(params.lastActivityAt ?? 0));
    runningForMs = Date.now() - lastActivityMs;
  }
  const isStaleRunning = storedAsRunning && runningForMs >= DISCORD_ACP_STALE_RUNNING_ACTIVITY_MS;

  if (outcome.kind === "timeout") {
    if (isStaleRunning) {
      return { status: "stale", reason: "status-timeout-running-stale" };
    }
    return { status: "uncertain", reason: "status-timeout" };
  }

  if (outcome.kind === "error") {
    return classifyAcpStatusProbeError({ error: outcome.error, isStaleRunning });
  }

  // An ACP "error" state can be cleared by the next turn, so the binding is
  // kept (reported as uncertain) unless a stronger stale signal exists.
  return outcome.status.state === "error"
    ? { status: "uncertain", reason: "status-error-state" }
    : { status: "healthy" };
}
async function deployDiscordCommands(params: {
client: Client;
runtime: RuntimeEnv;
@@ -382,14 +470,32 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
})
: createNoopThreadBindingManager(account.accountId);
if (threadBindingsEnabled) {
const reconciliation = reconcileAcpThreadBindingsOnStartup({
const uncertainProbeKeys = new Set<string>();
const reconciliation = await reconcileAcpThreadBindingsOnStartup({
cfg,
accountId: account.accountId,
sendFarewell: false,
healthProbe: async ({ sessionKey, session }) => {
const probe = await probeDiscordAcpBindingHealth({
cfg,
sessionKey,
storedState: session.acp?.state,
lastActivityAt: session.acp?.lastActivityAt,
});
if (probe.status === "uncertain") {
uncertainProbeKeys.add(`${sessionKey}${probe.reason ? ` (${probe.reason})` : ""}`);
}
return probe;
},
});
if (reconciliation.removed > 0) {
logVerbose(
`discord: removed ${reconciliation.removed}/${reconciliation.checked} stale ACP thread bindings on startup for account ${account.accountId}`,
`discord: removed ${reconciliation.removed}/${reconciliation.checked} stale ACP thread bindings on startup for account ${account.accountId}: ${reconciliation.staleSessionKeys.join(", ")}`,
);
}
if (uncertainProbeKeys.size > 0) {
logVerbose(
`discord: ACP thread-binding health probe uncertain for account ${account.accountId}: ${[...uncertainProbeKeys].join(", ")}`,
);
}
}
@@ -599,6 +705,7 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
runtime,
setStatus: opts.setStatus,
abortSignal: opts.abortSignal,
listenerTimeoutMs: eventQueueOpts.listenerTimeout,
botUserId,
guildHistories,
historyLimit,
@@ -623,7 +730,9 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
registerDiscordListener(
client.listeners,
new DiscordMessageListener(messageHandler, logger, trackInboundEvent),
new DiscordMessageListener(messageHandler, logger, trackInboundEvent, {
timeoutMs: eventQueueOpts.listenerTimeout,
}),
);
const reactionListenerOptions = {
cfg,