feat: ACP thread-bound agents (#23580)

* docs: add ACP thread-bound agents plan doc

* docs: expand ACP implementation specification

* feat(acp): route ACP sessions through core dispatch and lifecycle cleanup

* feat(acp): add /acp commands and Discord spawn gate

* ACP: add acpx runtime plugin backend

* fix(subagents): defer transient lifecycle errors before announce

* Agents: harden ACP sessions_spawn and tighten spawn guidance

* Agents: require explicit ACP target for runtime spawns

* docs: expand ACP control-plane implementation plan

* ACP: harden metadata seeding and spawn guidance

* ACP: centralize runtime control-plane manager and fail-closed dispatch

* ACP: harden runtime manager and unify spawn helpers

* Commands: route ACP sessions through ACP runtime in agent command

* ACP: require persisted metadata for runtime spawns

* Sessions: preserve ACP metadata when updating entries

* Plugins: harden ACP backend registry across loaders

* ACPX: make availability probe compatible with adapters

* E2E: add manual Discord ACP plain-language smoke script

* ACPX: preserve streamed spacing across Discord delivery

* Docs: add ACP Discord streaming strategy

* ACP: harden Discord stream buffering for thread replies

* ACP: reuse shared block reply pipeline for projector

* ACP: unify streaming config and adopt coalesceIdleMs

* Docs: add temporary ACP production hardening plan

* Docs: trim temporary ACP hardening plan goals

* Docs: gate ACP thread controls by backend capabilities

* ACP: add capability-gated runtime controls and /acp operator commands

* Docs: remove temporary ACP hardening plan

* ACP: fix spawn target validation and close cache cleanup

* ACP: harden runtime dispatch and recovery paths

* ACP: split ACP command/runtime internals and centralize policy

* ACP: harden runtime lifecycle, validation, and observability

* ACP: surface runtime and backend session IDs in thread bindings

* docs: add temp plan for binding-service migration

* ACP: migrate thread binding flows to SessionBindingService

* ACP: address review feedback and preserve prompt wording

* ACPX plugin: pin runtime dependency and prefer bundled CLI

* Discord: complete binding-service migration cleanup and restore ACP plan

* Docs: add standalone ACP agents guide

* ACP: route harness intents to thread-bound ACP sessions

* ACP: fix spawn thread routing and queue-owner stall

* ACP: harden startup reconciliation and command bypass handling

* ACP: fix dispatch bypass type narrowing

* ACP: align runtime metadata to agentSessionId

* ACP: normalize session identifier handling and labels

* ACP: mark thread banner session ids provisional until first reply

* ACP: stabilize session identity mapping and startup reconciliation

* ACP: add resolved session-id notices and cwd in thread intros

* Discord: prefix thread meta notices consistently

* Discord: unify ACP/thread meta notices with gear prefix

* Discord: split thread persona naming from meta formatting

* Extensions: bump acpx plugin dependency to 0.1.9

* Agents: gate ACP prompt guidance behind acp.enabled

* Docs: remove temp experiment plan docs

* Docs: scope streaming plan to holy grail refactor

* Docs: refactor ACP agents guide for human-first flow

* Docs/Skill: add ACP feature-flag guidance and direct acpx telephone-game flow

* Docs/Skill: add OpenCode and Pi to ACP harness lists

* Docs/Skill: align ACP harness list with current acpx registry

* Dev/Test: move ACP plain-language smoke script and mark as keep

* Docs/Skill: reorder ACP harness lists with Pi first

* ACP: split control-plane manager into core/types/utils modules

* Docs: refresh ACP thread-bound agents plan

* ACP: extract dispatch lane and split manager domains

* ACP: centralize binding context and remove reverse deps

* Infra: unify system message formatting

* ACP: centralize error boundaries and session id rendering

* ACP: enforce init concurrency cap and strict meta clear

* Tests: fix ACP dispatch binding mock typing

* Tests: fix Discord thread-binding mock drift and ACP request id

* ACP: gate slash bypass and persist cleared overrides

* ACPX: await pre-abort cancel before runTurn return

* Extension: pin acpx runtime dependency to 0.1.11

* Docs: add pinned acpx install strategy for ACP extension

* Extensions/acpx: enforce strict local pinned startup

* Extensions/acpx: tighten acp-router install guidance

* ACPX: retry runtime test temp-dir cleanup

* Extensions/acpx: require proactive ACPX repair for thread spawns

* Extensions/acpx: require restart offer after acpx reinstall

* extensions/acpx: remove workspace protocol devDependency

* extensions/acpx: bump pinned acpx to 0.1.13

* extensions/acpx: sync lockfile after dependency bump

* ACPX: make runtime spawn Windows-safe

* fix: align doctor-config-flow repair tests with default-account migration (#23580) (thanks @osolmaz)
This commit is contained in:
Onur Solmaz
2026-02-26 11:00:09 +01:00
committed by GitHub
parent a9d9a968ed
commit a7d56e3554
151 changed files with 19005 additions and 324 deletions

View File

@@ -66,6 +66,12 @@ const MAX_ANNOUNCE_RETRY_COUNT = 3;
*/
const ANNOUNCE_EXPIRY_MS = 5 * 60_000; // 5 minutes
type SubagentRunOrphanReason = "missing-session-entry" | "missing-session-id";
/**
* Embedded runs can emit transient lifecycle `error` events while provider/model
* retry is still in progress. Defer terminal error cleanup briefly so a
* subsequent lifecycle `start` / `end` can cancel premature failure announces.
*
* Used as the delay of the grace timer armed by `schedulePendingLifecycleError`.
*/
const LIFECYCLE_ERROR_RETRY_GRACE_MS = 15_000; // 15 seconds
function resolveAnnounceRetryDelayMs(retryCount: number) {
const boundedRetryCount = Math.max(0, Math.min(retryCount, 10));
@@ -204,6 +210,66 @@ function reconcileOrphanedRestoredRuns() {
const resumedRuns = new Set<string>();
const endedHookInFlightRunIds = new Set<string>();
/**
* Deferred lifecycle `error` completions, keyed by run id.
*
* Each record keeps the grace timer together with the `endedAt` / `error`
* values captured when the error was scheduled, so the timer callback can
* report them later. Records are added by `schedulePendingLifecycleError` and
* removed by `clearPendingLifecycleError` / `clearAllPendingLifecycleErrors`
* or by the timer callback itself when it fires.
*/
const pendingLifecycleErrorByRunId = new Map<
string,
{
// Handle of the grace timer; also used to detect a superseded callback.
timer: NodeJS.Timeout;
// End timestamp forwarded to the deferred completion.
endedAt: number;
// Optional error text forwarded to the deferred completion outcome.
error?: string;
}
>();
/**
 * Cancels the deferred lifecycle-error completion registered for `runId`.
 *
 * Stops the grace timer and drops the record from
 * `pendingLifecycleErrorByRunId`. Does nothing when no record exists for the
 * given run id.
 *
 * @param runId - Identifier of the run whose pending error should be discarded.
 */
function clearPendingLifecycleError(runId: string) {
  const scheduled = pendingLifecycleErrorByRunId.get(runId);
  if (scheduled) {
    // Stop the timer first so its callback can never observe a half-removed record.
    clearTimeout(scheduled.timer);
    pendingLifecycleErrorByRunId.delete(runId);
  }
}
/**
 * Cancels every deferred lifecycle-error completion at once.
 *
 * Stops the grace timer of each record in `pendingLifecycleErrorByRunId` and
 * then empties the map.
 */
function clearAllPendingLifecycleErrors() {
  pendingLifecycleErrorByRunId.forEach(({ timer }) => {
    clearTimeout(timer);
  });
  pendingLifecycleErrorByRunId.clear();
}
/**
 * Arms (or re-arms) a deferred error completion for a run.
 *
 * Any grace timer already pending for the run is cleared first. A new timer is
 * then started with a delay of `LIFECYCLE_ERROR_RETRY_GRACE_MS`; if it fires
 * while still being the registered timer for the run, the run is completed via
 * `completeSubagentRun` with an `error` outcome. The completion is skipped when
 * the run is no longer present in `subagentRuns`, or when it already ended with
 * the "complete" reason or an `ok` outcome.
 *
 * @param params.runId - Identifier of the run that reported the lifecycle error.
 * @param params.endedAt - End timestamp to report if the error becomes terminal.
 * @param params.error - Optional error text to report with the outcome.
 */
function schedulePendingLifecycleError(params: { runId: string; endedAt: number; error?: string }) {
  clearPendingLifecycleError(params.runId);

  const onGraceElapsed = () => {
    const scheduled = pendingLifecycleErrorByRunId.get(params.runId);
    // Bail out unless this exact timer is still the one registered for the run.
    if (scheduled === undefined || scheduled.timer !== timer) {
      return;
    }
    pendingLifecycleErrorByRunId.delete(params.runId);

    const run = subagentRuns.get(params.runId);
    if (!run) {
      return;
    }
    const finishedCleanly =
      run.endedReason === SUBAGENT_ENDED_REASON_COMPLETE || run.outcome?.status === "ok";
    if (finishedCleanly) {
      return;
    }

    // Fire-and-forget: the timer callback does not wait for the completion.
    void completeSubagentRun({
      runId: params.runId,
      endedAt: scheduled.endedAt,
      outcome: {
        status: "error",
        error: scheduled.error,
      },
      reason: SUBAGENT_ENDED_REASON_ERROR,
      sendFarewell: true,
      accountId: run.requesterOrigin?.accountId,
      triggerCleanup: true,
    });
  };

  const timer = setTimeout(onGraceElapsed, LIFECYCLE_ERROR_RETRY_GRACE_MS);
  // Where the handle supports it, do not let the grace timer keep the process alive.
  timer.unref?.();

  const record = {
    timer,
    endedAt: params.endedAt,
    error: params.error,
  };
  pendingLifecycleErrorByRunId.set(params.runId, record);
}
function suppressAnnounceForSteerRestart(entry?: SubagentRunRecord) {
return entry?.suppressAnnounceReason === "steer-restart";
@@ -256,6 +322,7 @@ async function completeSubagentRun(params: {
accountId?: string;
triggerCleanup: boolean;
}) {
clearPendingLifecycleError(params.runId);
const entry = subagentRuns.get(params.runId);
if (!entry) {
return;
@@ -491,6 +558,7 @@ async function sweepSubagentRuns() {
if (!entry.archiveAtMs || entry.archiveAtMs > now) {
continue;
}
clearPendingLifecycleError(runId);
subagentRuns.delete(runId);
mutated = true;
try {
@@ -531,6 +599,7 @@ function ensureListener() {
}
const phase = evt.data?.phase;
if (phase === "start") {
clearPendingLifecycleError(evt.runId);
const startedAt = typeof evt.data?.startedAt === "number" ? evt.data.startedAt : undefined;
if (startedAt) {
entry.startedAt = startedAt;
@@ -543,17 +612,23 @@ function ensureListener() {
}
const endedAt = typeof evt.data?.endedAt === "number" ? evt.data.endedAt : Date.now();
const error = typeof evt.data?.error === "string" ? evt.data.error : undefined;
const outcome: SubagentRunOutcome =
phase === "error"
? { status: "error", error }
: evt.data?.aborted
? { status: "timeout" }
: { status: "ok" };
if (phase === "error") {
schedulePendingLifecycleError({
runId: evt.runId,
endedAt,
error,
});
return;
}
clearPendingLifecycleError(evt.runId);
const outcome: SubagentRunOutcome = evt.data?.aborted
? { status: "timeout" }
: { status: "ok" };
await completeSubagentRun({
runId: evt.runId,
endedAt,
outcome,
reason: phase === "error" ? SUBAGENT_ENDED_REASON_ERROR : SUBAGENT_ENDED_REASON_COMPLETE,
reason: SUBAGENT_ENDED_REASON_COMPLETE,
sendFarewell: true,
accountId: entry.requesterOrigin?.accountId,
triggerCleanup: true,
@@ -661,6 +736,7 @@ function completeCleanupBookkeeping(params: {
completedAt: number;
}) {
if (params.cleanup === "delete") {
clearPendingLifecycleError(params.runId);
subagentRuns.delete(params.runId);
persistSubagentRuns();
retryDeferredCompletedAnnounces(params.runId);
@@ -774,6 +850,7 @@ export function replaceSubagentRunAfterSteer(params: {
}
if (previousRunId !== nextRunId) {
clearPendingLifecycleError(previousRunId);
subagentRuns.delete(previousRunId);
resumedRuns.delete(previousRunId);
}
@@ -935,6 +1012,7 @@ export function resetSubagentRegistryForTests(opts?: { persist?: boolean }) {
subagentRuns.clear();
resumedRuns.clear();
endedHookInFlightRunIds.clear();
clearAllPendingLifecycleErrors();
resetAnnounceQueuesForTests();
stopSweeper();
restoreAttempted = false;
@@ -953,6 +1031,7 @@ export function addSubagentRunForTests(entry: SubagentRunRecord) {
}
export function releaseSubagentRun(runId: string) {
clearPendingLifecycleError(runId);
const didDelete = subagentRuns.delete(runId);
if (didDelete) {
persistSubagentRuns();
@@ -1020,6 +1099,7 @@ export function markSubagentRunTerminated(params: {
let updated = 0;
const entriesByChildSessionKey = new Map<string, SubagentRunRecord>();
for (const runId of runIds) {
clearPendingLifecycleError(runId);
const entry = subagentRuns.get(runId);
if (!entry) {
continue;