feat: ACP thread-bound agents (#23580)

* docs: add ACP thread-bound agents plan doc * docs: expand ACP implementation specification * feat(acp): route ACP sessions through core dispatch and lifecycle cleanup * feat(acp): add /acp commands and Discord spawn gate * ACP: add acpx runtime plugin backend * fix(subagents): defer transient lifecycle errors before announce * Agents: harden ACP sessions_spawn and tighten spawn guidance * Agents: require explicit ACP target for runtime spawns * docs: expand ACP control-plane implementation plan * ACP: harden metadata seeding and spawn guidance * ACP: centralize runtime control-plane manager and fail-closed dispatch * ACP: harden runtime manager and unify spawn helpers * Commands: route ACP sessions through ACP runtime in agent command * ACP: require persisted metadata for runtime spawns * Sessions: preserve ACP metadata when updating entries * Plugins: harden ACP backend registry across loaders * ACPX: make availability probe compatible with adapters * E2E: add manual Discord ACP plain-language smoke script * ACPX: preserve streamed spacing across Discord delivery * Docs: add ACP Discord streaming strategy * ACP: harden Discord stream buffering for thread replies * ACP: reuse shared block reply pipeline for projector * ACP: unify streaming config and adopt coalesceIdleMs * Docs: add temporary ACP production hardening plan * Docs: trim temporary ACP hardening plan goals * Docs: gate ACP thread controls by backend capabilities * ACP: add capability-gated runtime controls and /acp operator commands * Docs: remove temporary ACP hardening plan * ACP: fix spawn target validation and close cache cleanup * ACP: harden runtime dispatch and recovery paths * ACP: split ACP command/runtime internals and centralize policy * ACP: harden runtime lifecycle, validation, and observability * ACP: surface runtime and backend session IDs in thread bindings * docs: add temp plan for binding-service migration * ACP: migrate thread binding flows to SessionBindingService * ACP: address 
review feedback and preserve prompt wording * ACPX plugin: pin runtime dependency and prefer bundled CLI * Discord: complete binding-service migration cleanup and restore ACP plan * Docs: add standalone ACP agents guide * ACP: route harness intents to thread-bound ACP sessions * ACP: fix spawn thread routing and queue-owner stall * ACP: harden startup reconciliation and command bypass handling * ACP: fix dispatch bypass type narrowing * ACP: align runtime metadata to agentSessionId * ACP: normalize session identifier handling and labels * ACP: mark thread banner session ids provisional until first reply * ACP: stabilize session identity mapping and startup reconciliation * ACP: add resolved session-id notices and cwd in thread intros * Discord: prefix thread meta notices consistently * Discord: unify ACP/thread meta notices with gear prefix * Discord: split thread persona naming from meta formatting * Extensions: bump acpx plugin dependency to 0.1.9 * Agents: gate ACP prompt guidance behind acp.enabled * Docs: remove temp experiment plan docs * Docs: scope streaming plan to holy grail refactor * Docs: refactor ACP agents guide for human-first flow * Docs/Skill: add ACP feature-flag guidance and direct acpx telephone-game flow * Docs/Skill: add OpenCode and Pi to ACP harness lists * Docs/Skill: align ACP harness list with current acpx registry * Dev/Test: move ACP plain-language smoke script and mark as keep * Docs/Skill: reorder ACP harness lists with Pi first * ACP: split control-plane manager into core/types/utils modules * Docs: refresh ACP thread-bound agents plan * ACP: extract dispatch lane and split manager domains * ACP: centralize binding context and remove reverse deps * Infra: unify system message formatting * ACP: centralize error boundaries and session id rendering * ACP: enforce init concurrency cap and strict meta clear * Tests: fix ACP dispatch binding mock typing * Tests: fix Discord thread-binding mock drift and ACP request id * ACP: gate slash 
bypass and persist cleared overrides * ACPX: await pre-abort cancel before runTurn return * Extension: pin acpx runtime dependency to 0.1.11 * Docs: add pinned acpx install strategy for ACP extension * Extensions/acpx: enforce strict local pinned startup * Extensions/acpx: tighten acp-router install guidance * ACPX: retry runtime test temp-dir cleanup * Extensions/acpx: require proactive ACPX repair for thread spawns * Extensions/acpx: require restart offer after acpx reinstall * extensions/acpx: remove workspace protocol devDependency * extensions/acpx: bump pinned acpx to 0.1.13 * extensions/acpx: sync lockfile after dependency bump * ACPX: make runtime spawn Windows-safe * fix: align doctor-config-flow repair tests with default-account migration (#23580) (thanks @osolmaz)
2026-05-11 06:14:34 +00:00 · 2026-02-26 11:00:09 +01:00
parent a9d9a968ed
commit a7d56e3554
151 changed files with 19005 additions and 324 deletions
--- a/src/agents/subagent-registry.ts
+++ b/src/agents/subagent-registry.ts
@@ -66,6 +66,12 @@ const MAX_ANNOUNCE_RETRY_COUNT = 3;
 */
 const ANNOUNCE_EXPIRY_MS = 5 * 60_000; // 5 minutes
 type SubagentRunOrphanReason = "missing-session-entry" | "missing-session-id";
+/**
+ * Embedded runs can emit transient lifecycle `error` events while provider/model
+ * retry is still in progress. Defer terminal error cleanup briefly so a
+ * subsequent lifecycle `start` / `end` can cancel premature failure announces.
+ */
+const LIFECYCLE_ERROR_RETRY_GRACE_MS = 15_000; // 15 seconds

 function resolveAnnounceRetryDelayMs(retryCount: number) {
  const boundedRetryCount = Math.max(0, Math.min(retryCount, 10));
@@ -204,6 +210,66 @@ function reconcileOrphanedRestoredRuns() {

 const resumedRuns = new Set<string>();
 const endedHookInFlightRunIds = new Set<string>();
+const pendingLifecycleErrorByRunId = new Map< // deferred lifecycle `error` completions, keyed by run id
+  string,
+  {
+    timer: NodeJS.Timeout; // grace-period timer; cleared whenever the pending error is cancelled
+    endedAt: number; // end timestamp handed to completeSubagentRun if the timer fires
+    error?: string; // error text reported in the eventual "error" outcome, when one was supplied
+  }
+>();
+
+function clearPendingLifecycleError(runId: string) { // cancel the deferred error completion for `runId`, if one is pending
+  const pending = pendingLifecycleErrorByRunId.get(runId);
+  if (!pending) {
+    return; // nothing scheduled for this run
+  }
+  clearTimeout(pending.timer); // stop the timer first so its callback never fires
+  pendingLifecycleErrorByRunId.delete(runId);
+}
+
+function clearAllPendingLifecycleErrors() { // cancel every deferred error completion and empty the map
+  for (const pending of pendingLifecycleErrorByRunId.values()) {
+    clearTimeout(pending.timer);
+  }
+  pendingLifecycleErrorByRunId.clear();
+}
+
+function schedulePendingLifecycleError(params: { runId: string; endedAt: number; error?: string }) { // defer an "error" completion for the run by LIFECYCLE_ERROR_RETRY_GRACE_MS
+  clearPendingLifecycleError(params.runId); // a newer error replaces any earlier pending one for the same run
+  const timer = setTimeout(() => {
+    const pending = pendingLifecycleErrorByRunId.get(params.runId);
+    if (!pending || pending.timer !== timer) { // stale callback: the entry was cleared or replaced by a later schedule
+      return;
+    }
+    pendingLifecycleErrorByRunId.delete(params.runId);
+    const entry = subagentRuns.get(params.runId);
+    if (!entry) { // run is no longer in the registry; nothing to complete
+      return;
+    }
+    if (entry.endedReason === SUBAGENT_ENDED_REASON_COMPLETE || entry.outcome?.status === "ok") { // run already recorded as complete/ok; skip the error completion
+      return;
+    }
+    void completeSubagentRun({ // NOTE(review): promise is discarded — confirm a rejection here cannot go unhandled
+      runId: params.runId,
+      endedAt: pending.endedAt,
+      outcome: {
+        status: "error",
+        error: pending.error,
+      },
+      reason: SUBAGENT_ENDED_REASON_ERROR,
+      sendFarewell: true,
+      accountId: entry.requesterOrigin?.accountId,
+      triggerCleanup: true,
+    });
+  }, LIFECYCLE_ERROR_RETRY_GRACE_MS);
+  timer.unref?.(); // where supported, keep this timer from holding the process open
+  pendingLifecycleErrorByRunId.set(params.runId, { // registered after creation so the callback can compare against its own timer
+    timer,
+    endedAt: params.endedAt,
+    error: params.error,
+  });
+}

 function suppressAnnounceForSteerRestart(entry?: SubagentRunRecord) {
  return entry?.suppressAnnounceReason === "steer-restart";
@@ -256,6 +322,7 @@ async function completeSubagentRun(params: {
  accountId?: string;
  triggerCleanup: boolean;
 }) {
+  clearPendingLifecycleError(params.runId);
  const entry = subagentRuns.get(params.runId);
  if (!entry) {
    return;
@@ -491,6 +558,7 @@ async function sweepSubagentRuns() {
    if (!entry.archiveAtMs || entry.archiveAtMs > now) {
      continue;
    }
+    clearPendingLifecycleError(runId);
    subagentRuns.delete(runId);
    mutated = true;
    try {
@@ -531,6 +599,7 @@ function ensureListener() {
      }
      const phase = evt.data?.phase;
      if (phase === "start") {
+        clearPendingLifecycleError(evt.runId);
        const startedAt = typeof evt.data?.startedAt === "number" ? evt.data.startedAt : undefined;
        if (startedAt) {
          entry.startedAt = startedAt;
@@ -543,17 +612,23 @@ function ensureListener() {
      }
      const endedAt = typeof evt.data?.endedAt === "number" ? evt.data.endedAt : Date.now();
      const error = typeof evt.data?.error === "string" ? evt.data.error : undefined;
-      const outcome: SubagentRunOutcome =
-        phase === "error"
-          ? { status: "error", error }
-          : evt.data?.aborted
-            ? { status: "timeout" }
-            : { status: "ok" };
+      if (phase === "error") {
+        schedulePendingLifecycleError({
+          runId: evt.runId,
+          endedAt,
+          error,
+        });
+        return;
+      }
+      clearPendingLifecycleError(evt.runId);
+      const outcome: SubagentRunOutcome = evt.data?.aborted
+        ? { status: "timeout" }
+        : { status: "ok" };
      await completeSubagentRun({
        runId: evt.runId,
        endedAt,
        outcome,
-        reason: phase === "error" ? SUBAGENT_ENDED_REASON_ERROR : SUBAGENT_ENDED_REASON_COMPLETE,
+        reason: SUBAGENT_ENDED_REASON_COMPLETE,
        sendFarewell: true,
        accountId: entry.requesterOrigin?.accountId,
        triggerCleanup: true,
@@ -661,6 +736,7 @@ function completeCleanupBookkeeping(params: {
  completedAt: number;
 }) {
  if (params.cleanup === "delete") {
+    clearPendingLifecycleError(params.runId);
    subagentRuns.delete(params.runId);
    persistSubagentRuns();
    retryDeferredCompletedAnnounces(params.runId);
@@ -774,6 +850,7 @@ export function replaceSubagentRunAfterSteer(params: {
  }

  if (previousRunId !== nextRunId) {
+    clearPendingLifecycleError(previousRunId);
    subagentRuns.delete(previousRunId);
    resumedRuns.delete(previousRunId);
  }
@@ -935,6 +1012,7 @@ export function resetSubagentRegistryForTests(opts?: { persist?: boolean }) {
  subagentRuns.clear();
  resumedRuns.clear();
  endedHookInFlightRunIds.clear();
+  clearAllPendingLifecycleErrors();
  resetAnnounceQueuesForTests();
  stopSweeper();
  restoreAttempted = false;
@@ -953,6 +1031,7 @@ export function addSubagentRunForTests(entry: SubagentRunRecord) {
 }

 export function releaseSubagentRun(runId: string) {
+  clearPendingLifecycleError(runId);
  const didDelete = subagentRuns.delete(runId);
  if (didDelete) {
    persistSubagentRuns();
@@ -1020,6 +1099,7 @@ export function markSubagentRunTerminated(params: {
  let updated = 0;
  const entriesByChildSessionKey = new Map<string, SubagentRunRecord>();
  for (const runId of runIds) {
+    clearPendingLifecycleError(runId);
    const entry = subagentRuns.get(runId);
    if (!entry) {
      continue;