mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-07 17:01:22 +00:00
fix: defer gateway restart until all replies are sent (#12970)
* fix: defer gateway restart until all replies are sent Fixes a race condition where gateway config changes (e.g., enabling plugins via iMessage) trigger an immediate SIGUSR1 restart, killing the iMessage RPC connection before replies are delivered. Both restart paths (config watcher and RPC-triggered) now defer until all queued operations, pending replies, and embedded agent runs complete (polling every 500ms, 30s timeout). A shared emitGatewayRestart() guard prevents double SIGUSR1 when both paths fire simultaneously. Key changes: - Dispatcher registry tracks active reply dispatchers globally - markComplete() called in finally block for guaranteed cleanup - Pre-restart deferral hook registered at gateway startup - Centralized extractDeliveryInfo() for session key parsing - Post-restart sentinel messages delivered directly (not via agent) - config-patch distinguished from config-apply in sentinel kind Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: single-source gateway restart authorization --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
||||
isGatewaySigusr1RestartExternallyAllowed,
|
||||
scheduleGatewaySigusr1Restart,
|
||||
setGatewaySigusr1RestartPolicy,
|
||||
setPreRestartDeferralCheck,
|
||||
} from "./restart.js";
|
||||
import { createTelegramRetryRunner } from "./retry-policy.js";
|
||||
import { getShellPathFromLoginShell, resetShellPathCacheForTests } from "./shell-env.js";
|
||||
@@ -79,11 +80,15 @@ describe("infra runtime", () => {
|
||||
__testing.resetSigusr1State();
|
||||
});
|
||||
|
||||
it("consumes a scheduled authorization once", async () => {
|
||||
it("authorizes exactly once when scheduled restart emits", async () => {
|
||||
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
|
||||
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
|
||||
// No pre-authorization before the scheduled emission fires.
|
||||
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
|
||||
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(true);
|
||||
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
|
||||
|
||||
@@ -97,6 +102,110 @@ describe("infra runtime", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("pre-restart deferral check", () => {
|
||||
beforeEach(() => {
|
||||
__testing.resetSigusr1State();
|
||||
vi.useFakeTimers();
|
||||
vi.spyOn(process, "kill").mockImplementation(() => true);
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await vi.runOnlyPendingTimersAsync();
|
||||
vi.useRealTimers();
|
||||
vi.restoreAllMocks();
|
||||
__testing.resetSigusr1State();
|
||||
});
|
||||
|
||||
it("emits SIGUSR1 immediately when no deferral check is registered", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("emits SIGUSR1 immediately when deferral check returns 0", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
setPreRestartDeferralCheck(() => 0);
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("defers SIGUSR1 until deferral check returns 0", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
let pending = 2;
|
||||
setPreRestartDeferralCheck(() => pending);
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
|
||||
// After initial delay fires, deferral check returns 2 — should NOT emit yet
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
|
||||
// After one poll (500ms), still pending
|
||||
await vi.advanceTimersByTimeAsync(500);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
|
||||
// Drain pending work
|
||||
pending = 0;
|
||||
await vi.advanceTimersByTimeAsync(500);
|
||||
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("emits SIGUSR1 after deferral timeout even if still pending", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
setPreRestartDeferralCheck(() => 5); // always pending
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
|
||||
// Fire initial timeout
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
|
||||
// Advance past the 30s max deferral wait
|
||||
await vi.advanceTimersByTimeAsync(30_000);
|
||||
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("emits SIGUSR1 if deferral check throws", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
setPreRestartDeferralCheck(() => {
|
||||
throw new Error("boom");
|
||||
});
|
||||
scheduleGatewaySigusr1Restart({ delayMs: 0 });
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("getShellPathFromLoginShell", () => {
|
||||
afterEach(() => resetShellPathCacheForTests());
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
import {
|
||||
consumeRestartSentinel,
|
||||
formatRestartSentinelMessage,
|
||||
readRestartSentinel,
|
||||
resolveRestartSentinelPath,
|
||||
trimLogTail,
|
||||
@@ -61,6 +62,40 @@ describe("restart sentinel", () => {
|
||||
await expect(fs.stat(filePath)).rejects.toThrow();
|
||||
});
|
||||
|
||||
it("formatRestartSentinelMessage uses custom message when present", () => {
|
||||
const payload = {
|
||||
kind: "config-apply" as const,
|
||||
status: "ok" as const,
|
||||
ts: Date.now(),
|
||||
message: "Config updated successfully",
|
||||
};
|
||||
expect(formatRestartSentinelMessage(payload)).toBe("Config updated successfully");
|
||||
});
|
||||
|
||||
it("formatRestartSentinelMessage falls back to summary when no message", () => {
|
||||
const payload = {
|
||||
kind: "update" as const,
|
||||
status: "ok" as const,
|
||||
ts: Date.now(),
|
||||
stats: { mode: "git" },
|
||||
};
|
||||
const result = formatRestartSentinelMessage(payload);
|
||||
expect(result).toContain("Gateway restart");
|
||||
expect(result).toContain("update");
|
||||
expect(result).toContain("ok");
|
||||
});
|
||||
|
||||
it("formatRestartSentinelMessage falls back to summary for blank message", () => {
|
||||
const payload = {
|
||||
kind: "restart" as const,
|
||||
status: "ok" as const,
|
||||
ts: Date.now(),
|
||||
message: " ",
|
||||
};
|
||||
const result = formatRestartSentinelMessage(payload);
|
||||
expect(result).toContain("Gateway restart");
|
||||
});
|
||||
|
||||
it("trims log tails", () => {
|
||||
const text = "a".repeat(9000);
|
||||
const trimmed = trimLogTail(text, 8000);
|
||||
|
||||
@@ -28,7 +28,7 @@ export type RestartSentinelStats = {
|
||||
};
|
||||
|
||||
export type RestartSentinelPayload = {
|
||||
kind: "config-apply" | "update" | "restart";
|
||||
kind: "config-apply" | "config-patch" | "update" | "restart";
|
||||
status: "ok" | "error" | "skipped";
|
||||
ts: number;
|
||||
sessionKey?: string;
|
||||
@@ -109,7 +109,10 @@ export async function consumeRestartSentinel(
|
||||
}
|
||||
|
||||
export function formatRestartSentinelMessage(payload: RestartSentinelPayload): string {
|
||||
return `GatewayRestart:\n${JSON.stringify(payload, null, 2)}`;
|
||||
if (payload.message?.trim()) {
|
||||
return payload.message.trim();
|
||||
}
|
||||
return summarizeRestartSentinel(payload);
|
||||
}
|
||||
|
||||
export function summarizeRestartSentinel(payload: RestartSentinelPayload): string {
|
||||
|
||||
@@ -17,6 +17,40 @@ const SIGUSR1_AUTH_GRACE_MS = 5000;
|
||||
let sigusr1AuthorizedCount = 0;
|
||||
let sigusr1AuthorizedUntil = 0;
|
||||
let sigusr1ExternalAllowed = false;
|
||||
let preRestartCheck: (() => number) | null = null;
|
||||
let sigusr1Emitted = false;
|
||||
|
||||
/**
|
||||
* Register a callback that scheduleGatewaySigusr1Restart checks before emitting SIGUSR1.
|
||||
* The callback should return the number of pending items (0 = safe to restart).
|
||||
*/
|
||||
export function setPreRestartDeferralCheck(fn: () => number): void {
|
||||
preRestartCheck = fn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit an authorized SIGUSR1 gateway restart, guarded against duplicate emissions.
|
||||
* Returns true if SIGUSR1 was emitted, false if a restart was already emitted.
|
||||
* Both scheduleGatewaySigusr1Restart and the config watcher should use this
|
||||
* to ensure only one restart fires.
|
||||
*/
|
||||
export function emitGatewayRestart(): boolean {
|
||||
if (sigusr1Emitted) {
|
||||
return false;
|
||||
}
|
||||
sigusr1Emitted = true;
|
||||
authorizeGatewaySigusr1Restart();
|
||||
try {
|
||||
if (process.listenerCount("SIGUSR1") > 0) {
|
||||
process.emit("SIGUSR1");
|
||||
} else {
|
||||
process.kill(process.pid, "SIGUSR1");
|
||||
}
|
||||
} catch {
|
||||
/* ignore */
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function resetSigusr1AuthorizationIfExpired(now = Date.now()) {
|
||||
if (sigusr1AuthorizedCount <= 0) {
|
||||
@@ -37,7 +71,7 @@ export function isGatewaySigusr1RestartExternallyAllowed() {
|
||||
return sigusr1ExternalAllowed;
|
||||
}
|
||||
|
||||
export function authorizeGatewaySigusr1Restart(delayMs = 0) {
|
||||
function authorizeGatewaySigusr1Restart(delayMs = 0) {
|
||||
const delay = Math.max(0, Math.floor(delayMs));
|
||||
const expiresAt = Date.now() + delay + SIGUSR1_AUTH_GRACE_MS;
|
||||
sigusr1AuthorizedCount += 1;
|
||||
@@ -51,6 +85,10 @@ export function consumeGatewaySigusr1RestartAuthorization(): boolean {
|
||||
if (sigusr1AuthorizedCount <= 0) {
|
||||
return false;
|
||||
}
|
||||
// Reset the emission guard so the next restart cycle can fire.
|
||||
// The run loop re-enters startGatewayServer() after close(), which
|
||||
// re-registers setPreRestartDeferralCheck and can schedule new restarts.
|
||||
sigusr1Emitted = false;
|
||||
sigusr1AuthorizedCount -= 1;
|
||||
if (sigusr1AuthorizedCount <= 0) {
|
||||
sigusr1AuthorizedUntil = 0;
|
||||
@@ -189,27 +227,48 @@ export function scheduleGatewaySigusr1Restart(opts?: {
|
||||
typeof opts?.reason === "string" && opts.reason.trim()
|
||||
? opts.reason.trim().slice(0, 200)
|
||||
: undefined;
|
||||
authorizeGatewaySigusr1Restart(delayMs);
|
||||
const pid = process.pid;
|
||||
const hasListener = process.listenerCount("SIGUSR1") > 0;
|
||||
const DEFERRAL_POLL_MS = 500;
|
||||
const DEFERRAL_MAX_WAIT_MS = 30_000;
|
||||
|
||||
setTimeout(() => {
|
||||
try {
|
||||
if (hasListener) {
|
||||
process.emit("SIGUSR1");
|
||||
} else {
|
||||
process.kill(pid, "SIGUSR1");
|
||||
}
|
||||
} catch {
|
||||
/* ignore */
|
||||
if (!preRestartCheck) {
|
||||
emitGatewayRestart();
|
||||
return;
|
||||
}
|
||||
let pending: number;
|
||||
try {
|
||||
pending = preRestartCheck();
|
||||
} catch {
|
||||
emitGatewayRestart();
|
||||
return;
|
||||
}
|
||||
if (pending <= 0) {
|
||||
emitGatewayRestart();
|
||||
return;
|
||||
}
|
||||
// Poll until pending work drains or timeout
|
||||
let waited = 0;
|
||||
const poll = setInterval(() => {
|
||||
waited += DEFERRAL_POLL_MS;
|
||||
let current: number;
|
||||
try {
|
||||
current = preRestartCheck!();
|
||||
} catch {
|
||||
current = 0;
|
||||
}
|
||||
if (current <= 0 || waited >= DEFERRAL_MAX_WAIT_MS) {
|
||||
clearInterval(poll);
|
||||
emitGatewayRestart();
|
||||
}
|
||||
}, DEFERRAL_POLL_MS);
|
||||
}, delayMs);
|
||||
return {
|
||||
ok: true,
|
||||
pid,
|
||||
pid: process.pid,
|
||||
signal: "SIGUSR1",
|
||||
delayMs,
|
||||
reason,
|
||||
mode: hasListener ? "emit" : "signal",
|
||||
mode: process.listenerCount("SIGUSR1") > 0 ? "emit" : "signal",
|
||||
};
|
||||
}
|
||||
|
||||
@@ -218,5 +277,7 @@ export const __testing = {
|
||||
sigusr1AuthorizedCount = 0;
|
||||
sigusr1AuthorizedUntil = 0;
|
||||
sigusr1ExternalAllowed = false;
|
||||
preRestartCheck = null;
|
||||
sigusr1Emitted = false;
|
||||
},
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user