fix: defer gateway restart until all replies are sent (#12970)

* fix: defer gateway restart until all replies are sent

Fixes a race condition where gateway config changes (e.g., enabling
plugins via iMessage) trigger an immediate SIGUSR1 restart, killing the
iMessage RPC connection before replies are delivered.

Both restart paths (config watcher and RPC-triggered) now defer until
all queued operations, pending replies, and embedded agent runs complete
(polling every 500ms, 30s timeout). A shared emitGatewayRestart() guard
prevents double SIGUSR1 when both paths fire simultaneously.

Key changes:
- Dispatcher registry tracks active reply dispatchers globally
- markComplete() called in finally block for guaranteed cleanup
- Pre-restart deferral hook registered at gateway startup
- Centralized extractDeliveryInfo() for session key parsing
- Post-restart sentinel messages delivered directly (not via agent)
- config-patch distinguished from config-apply in sentinel kind

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: single-source gateway restart authorization

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Bridgerz
2026-02-13 15:29:29 -08:00
committed by GitHub
parent dc507f3dec
commit ab4a08a82a
21 changed files with 976 additions and 76 deletions

View File

@@ -2,15 +2,14 @@ import type { CliDeps } from "../cli/deps.js";
import type { loadConfig } from "../config/config.js";
import type { HeartbeatRunner } from "../infra/heartbeat-runner.js";
import type { ChannelKind, GatewayReloadPlan } from "./config-reload.js";
import { getActiveEmbeddedRunCount } from "../agents/pi-embedded-runner/runs.js";
import { getTotalPendingReplies } from "../auto-reply/reply/dispatcher-registry.js";
import { resolveAgentMaxConcurrent, resolveSubagentMaxConcurrent } from "../config/agent-limits.js";
import { startGmailWatcher, stopGmailWatcher } from "../hooks/gmail-watcher.js";
import { isTruthyEnvValue } from "../infra/env.js";
import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
import {
authorizeGatewaySigusr1Restart,
setGatewaySigusr1RestartPolicy,
} from "../infra/restart.js";
import { setCommandLaneConcurrency } from "../process/command-queue.js";
import { emitGatewayRestart, setGatewaySigusr1RestartPolicy } from "../infra/restart.js";
import { setCommandLaneConcurrency, getTotalQueueSize } from "../process/command-queue.js";
import { CommandLane } from "../process/lanes.js";
import { resolveHooksConfig } from "./hooks.js";
import { startBrowserControlServerIfEnabled } from "./server-browser.js";
@@ -140,6 +139,8 @@ export function createGatewayReloadHandlers(params: {
params.setState(nextState);
};
let restartPending = false;
const requestGatewayRestart = (
plan: GatewayReloadPlan,
nextConfig: ReturnType<typeof loadConfig>,
@@ -148,13 +149,85 @@ export function createGatewayReloadHandlers(params: {
const reasons = plan.restartReasons.length
? plan.restartReasons.join(", ")
: plan.changedPaths.join(", ");
params.logReload.warn(`config change requires gateway restart (${reasons})`);
if (process.listenerCount("SIGUSR1") === 0) {
params.logReload.warn("no SIGUSR1 listener found; restart skipped");
return;
}
authorizeGatewaySigusr1Restart();
process.emit("SIGUSR1");
// Restart gating: a restart is emitted right away when nothing is in flight;
// otherwise it is postponed and re-checked on a timer until the gateway is
// idle or the wait budget is exhausted.

// Reads the three activity counters (command queue, pending replies,
// embedded runs) in a fixed order and returns them with their sum.
const snapshotActivity = () => {
  const queued = getTotalQueueSize();
  const replies = getTotalPendingReplies();
  const runs = getActiveEmbeddedRunCount();
  return { queued, replies, runs, total: queued + replies + runs };
};

// Renders "<count> <label>" for every entry with a positive count, comma-separated.
const listActive = (entries: ReadonlyArray<readonly [number, string]>): string =>
  entries
    .filter(([count]) => count > 0)
    .map(([count, label]) => `${count} ${label}`)
    .join(", ");

const initial = snapshotActivity();
const hasActiveWork = initial.total > 0;

if (!hasActiveWork) {
  // Nothing queued, pending, or running: restart without waiting.
  params.logReload.warn(`config change requires gateway restart (${reasons})`);
  emitGatewayRestart();
  return;
}

if (restartPending) {
  // A polling loop from an earlier config change is already waiting; do not start another.
  params.logReload.info(
    `config change requires gateway restart (${reasons}) — already waiting for operations to complete`,
  );
  return;
}
restartPending = true;

const waitingOn = listActive([
  [initial.queued, "queued operation(s)"],
  [initial.replies, "pending reply(ies)"],
  [initial.runs, "embedded run(s)"],
]);
params.logReload.warn(
  `config change requires gateway restart (${reasons}) — deferring until ${waitingOn} complete`,
);

// Poll every 500ms and give up waiting after 30 seconds.
const waitBudgetMs = 30_000;
const pollEveryMs = 500;
const deferredAt = Date.now();

const pollUntilIdle = (): void => {
  const current = snapshotActivity();
  const waitedMs = Date.now() - deferredAt;

  if (current.total === 0) {
    // Everything drained: clear the pending flag and restart.
    restartPending = false;
    params.logReload.info("all operations and replies completed; restarting gateway now");
    emitGatewayRestart();
    return;
  }

  if (waitedMs >= waitBudgetMs) {
    // Wait budget exhausted: report what is still active and restart regardless.
    const stillActive = listActive([
      [current.queued, "operation(s)"],
      [current.replies, "reply(ies)"],
      [current.runs, "embedded run(s)"],
    ]);
    restartPending = false;
    params.logReload.warn(
      `restart timeout after ${waitedMs}ms with ${stillActive} still active; restarting anyway`,
    );
    emitGatewayRestart();
    return;
  }

  // Still busy and within the budget: look again after the next interval.
  setTimeout(pollUntilIdle, pollEveryMs);
};

setTimeout(pollUntilIdle, pollEveryMs);
};
return { applyHotReload, requestGatewayRestart };