mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 05:22:44 +00:00
fix: defer gateway restart until all replies are sent (#12970)
* fix: defer gateway restart until all replies are sent

Fixes a race condition where gateway config changes (e.g., enabling plugins via iMessage) trigger an immediate SIGUSR1 restart, killing the iMessage RPC connection before replies are delivered.

Both restart paths (config watcher and RPC-triggered) now defer until all queued operations, pending replies, and embedded agent runs complete (polling every 500ms, 30s timeout). A shared emitGatewayRestart() guard prevents double SIGUSR1 when both paths fire simultaneously.

Key changes:
- Dispatcher registry tracks active reply dispatchers globally
- markComplete() called in finally block for guaranteed cleanup
- Pre-restart deferral hook registered at gateway startup
- Centralized extractDeliveryInfo() for session key parsing
- Post-restart sentinel messages delivered directly (not via agent)
- config-patch distinguished from config-apply in sentinel kind

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: single-source gateway restart authorization

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -2,15 +2,14 @@ import type { CliDeps } from "../cli/deps.js";
|
||||
import type { loadConfig } from "../config/config.js";
|
||||
import type { HeartbeatRunner } from "../infra/heartbeat-runner.js";
|
||||
import type { ChannelKind, GatewayReloadPlan } from "./config-reload.js";
|
||||
import { getActiveEmbeddedRunCount } from "../agents/pi-embedded-runner/runs.js";
|
||||
import { getTotalPendingReplies } from "../auto-reply/reply/dispatcher-registry.js";
|
||||
import { resolveAgentMaxConcurrent, resolveSubagentMaxConcurrent } from "../config/agent-limits.js";
|
||||
import { startGmailWatcher, stopGmailWatcher } from "../hooks/gmail-watcher.js";
|
||||
import { isTruthyEnvValue } from "../infra/env.js";
|
||||
import { resetDirectoryCache } from "../infra/outbound/target-resolver.js";
|
||||
import {
|
||||
authorizeGatewaySigusr1Restart,
|
||||
setGatewaySigusr1RestartPolicy,
|
||||
} from "../infra/restart.js";
|
||||
import { setCommandLaneConcurrency } from "../process/command-queue.js";
|
||||
import { emitGatewayRestart, setGatewaySigusr1RestartPolicy } from "../infra/restart.js";
|
||||
import { setCommandLaneConcurrency, getTotalQueueSize } from "../process/command-queue.js";
|
||||
import { CommandLane } from "../process/lanes.js";
|
||||
import { resolveHooksConfig } from "./hooks.js";
|
||||
import { startBrowserControlServerIfEnabled } from "./server-browser.js";
|
||||
@@ -140,6 +139,8 @@ export function createGatewayReloadHandlers(params: {
|
||||
params.setState(nextState);
|
||||
};
|
||||
|
||||
let restartPending = false;
|
||||
|
||||
const requestGatewayRestart = (
|
||||
plan: GatewayReloadPlan,
|
||||
nextConfig: ReturnType<typeof loadConfig>,
|
||||
@@ -148,13 +149,85 @@ export function createGatewayReloadHandlers(params: {
|
||||
const reasons = plan.restartReasons.length
|
||||
? plan.restartReasons.join(", ")
|
||||
: plan.changedPaths.join(", ");
|
||||
params.logReload.warn(`config change requires gateway restart (${reasons})`);
|
||||
|
||||
if (process.listenerCount("SIGUSR1") === 0) {
|
||||
params.logReload.warn("no SIGUSR1 listener found; restart skipped");
|
||||
return;
|
||||
}
|
||||
authorizeGatewaySigusr1Restart();
|
||||
process.emit("SIGUSR1");
|
||||
|
||||
// Check if there are active operations (commands in queue, pending replies, or embedded runs)
|
||||
const queueSize = getTotalQueueSize();
|
||||
const pendingReplies = getTotalPendingReplies();
|
||||
const embeddedRuns = getActiveEmbeddedRunCount();
|
||||
const totalActive = queueSize + pendingReplies + embeddedRuns;
|
||||
|
||||
if (totalActive > 0) {
|
||||
// Avoid spinning up duplicate polling loops from repeated config changes.
|
||||
if (restartPending) {
|
||||
params.logReload.info(
|
||||
`config change requires gateway restart (${reasons}) — already waiting for operations to complete`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
restartPending = true;
|
||||
const details = [];
|
||||
if (queueSize > 0) {
|
||||
details.push(`${queueSize} queued operation(s)`);
|
||||
}
|
||||
if (pendingReplies > 0) {
|
||||
details.push(`${pendingReplies} pending reply(ies)`);
|
||||
}
|
||||
if (embeddedRuns > 0) {
|
||||
details.push(`${embeddedRuns} embedded run(s)`);
|
||||
}
|
||||
params.logReload.warn(
|
||||
`config change requires gateway restart (${reasons}) — deferring until ${details.join(", ")} complete`,
|
||||
);
|
||||
|
||||
// Wait for all operations and replies to complete before restarting (max 30 seconds)
|
||||
const maxWaitMs = 30_000;
|
||||
const checkIntervalMs = 500;
|
||||
const startTime = Date.now();
|
||||
|
||||
const checkAndRestart = () => {
|
||||
const currentQueueSize = getTotalQueueSize();
|
||||
const currentPendingReplies = getTotalPendingReplies();
|
||||
const currentEmbeddedRuns = getActiveEmbeddedRunCount();
|
||||
const currentTotalActive = currentQueueSize + currentPendingReplies + currentEmbeddedRuns;
|
||||
const elapsed = Date.now() - startTime;
|
||||
|
||||
if (currentTotalActive === 0) {
|
||||
restartPending = false;
|
||||
params.logReload.info("all operations and replies completed; restarting gateway now");
|
||||
emitGatewayRestart();
|
||||
} else if (elapsed >= maxWaitMs) {
|
||||
const remainingDetails = [];
|
||||
if (currentQueueSize > 0) {
|
||||
remainingDetails.push(`${currentQueueSize} operation(s)`);
|
||||
}
|
||||
if (currentPendingReplies > 0) {
|
||||
remainingDetails.push(`${currentPendingReplies} reply(ies)`);
|
||||
}
|
||||
if (currentEmbeddedRuns > 0) {
|
||||
remainingDetails.push(`${currentEmbeddedRuns} embedded run(s)`);
|
||||
}
|
||||
restartPending = false;
|
||||
params.logReload.warn(
|
||||
`restart timeout after ${elapsed}ms with ${remainingDetails.join(", ")} still active; restarting anyway`,
|
||||
);
|
||||
emitGatewayRestart();
|
||||
} else {
|
||||
// Check again soon
|
||||
setTimeout(checkAndRestart, checkIntervalMs);
|
||||
}
|
||||
};
|
||||
|
||||
setTimeout(checkAndRestart, checkIntervalMs);
|
||||
} else {
|
||||
// No active operations or pending replies, restart immediately
|
||||
params.logReload.warn(`config change requires gateway restart (${reasons})`);
|
||||
emitGatewayRestart();
|
||||
}
|
||||
};
|
||||
|
||||
return { applyHotReload, requestGatewayRestart };
|
||||
|
||||
Reference in New Issue
Block a user