mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 09:01:22 +00:00
fix: harden gateway control-plane restart protections
This commit is contained in:
@@ -124,6 +124,56 @@ describe("infra runtime", () => {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("coalesces duplicate scheduled restarts into a single pending timer", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
const first = scheduleGatewaySigusr1Restart({ delayMs: 1_000, reason: "first" });
|
||||
const second = scheduleGatewaySigusr1Restart({ delayMs: 1_000, reason: "second" });
|
||||
|
||||
expect(first.coalesced).toBe(false);
|
||||
expect(second.coalesced).toBe(true);
|
||||
|
||||
await vi.advanceTimersByTimeAsync(999);
|
||||
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
|
||||
|
||||
await vi.advanceTimersByTimeAsync(1);
|
||||
const sigusr1Emits = emitSpy.mock.calls.filter((args) => args[0] === "SIGUSR1");
|
||||
expect(sigusr1Emits.length).toBe(1);
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
|
||||
it("applies restart cooldown between emitted restart cycles", async () => {
|
||||
const emitSpy = vi.spyOn(process, "emit");
|
||||
const handler = () => {};
|
||||
process.on("SIGUSR1", handler);
|
||||
try {
|
||||
const first = scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "first" });
|
||||
expect(first.coalesced).toBe(false);
|
||||
expect(first.delayMs).toBe(0);
|
||||
|
||||
await vi.advanceTimersByTimeAsync(0);
|
||||
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(true);
|
||||
markGatewaySigusr1RestartHandled();
|
||||
|
||||
const second = scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "second" });
|
||||
expect(second.coalesced).toBe(false);
|
||||
expect(second.delayMs).toBe(30_000);
|
||||
expect(second.cooldownMsApplied).toBe(30_000);
|
||||
|
||||
await vi.advanceTimersByTimeAsync(29_999);
|
||||
expect(emitSpy.mock.calls.filter((args) => args[0] === "SIGUSR1").length).toBe(1);
|
||||
|
||||
await vi.advanceTimersByTimeAsync(1);
|
||||
expect(emitSpy.mock.calls.filter((args) => args[0] === "SIGUSR1").length).toBe(2);
|
||||
} finally {
|
||||
process.removeListener("SIGUSR1", handler);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("pre-restart deferral check", () => {
|
||||
|
||||
@@ -3,6 +3,7 @@ import {
|
||||
resolveGatewayLaunchAgentLabel,
|
||||
resolveGatewaySystemdServiceName,
|
||||
} from "../daemon/constants.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
|
||||
export type RestartAttempt = {
|
||||
ok: boolean;
|
||||
@@ -15,6 +16,9 @@ const SPAWN_TIMEOUT_MS = 2000;
|
||||
const SIGUSR1_AUTH_GRACE_MS = 5000;
|
||||
const DEFAULT_DEFERRAL_POLL_MS = 500;
|
||||
const DEFAULT_DEFERRAL_MAX_WAIT_MS = 30_000;
|
||||
const RESTART_COOLDOWN_MS = 30_000;
|
||||
|
||||
const restartLog = createSubsystemLogger("restart");
|
||||
|
||||
let sigusr1AuthorizedCount = 0;
|
||||
let sigusr1AuthorizedUntil = 0;
|
||||
@@ -23,11 +27,65 @@ let preRestartCheck: (() => number) | null = null;
|
||||
let restartCycleToken = 0;
|
||||
let emittedRestartToken = 0;
|
||||
let consumedRestartToken = 0;
|
||||
let lastRestartEmittedAt = 0;
|
||||
let pendingRestartTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
let pendingRestartDueAt = 0;
|
||||
let pendingRestartReason: string | undefined;
|
||||
|
||||
function hasUnconsumedRestartSignal(): boolean {
|
||||
return emittedRestartToken > consumedRestartToken;
|
||||
}
|
||||
|
||||
function clearPendingScheduledRestart(): void {
|
||||
if (pendingRestartTimer) {
|
||||
clearTimeout(pendingRestartTimer);
|
||||
}
|
||||
pendingRestartTimer = null;
|
||||
pendingRestartDueAt = 0;
|
||||
pendingRestartReason = undefined;
|
||||
}
|
||||
|
||||
export type RestartAuditInfo = {
|
||||
actor?: string;
|
||||
deviceId?: string;
|
||||
clientIp?: string;
|
||||
changedPaths?: string[];
|
||||
};
|
||||
|
||||
function summarizeChangedPaths(paths: string[] | undefined, maxPaths = 6): string | null {
|
||||
if (!Array.isArray(paths) || paths.length === 0) {
|
||||
return null;
|
||||
}
|
||||
if (paths.length <= maxPaths) {
|
||||
return paths.join(",");
|
||||
}
|
||||
const head = paths.slice(0, maxPaths).join(",");
|
||||
return `${head},+${paths.length - maxPaths} more`;
|
||||
}
|
||||
|
||||
function formatRestartAudit(audit: RestartAuditInfo | undefined): string {
|
||||
const actor = typeof audit?.actor === "string" && audit.actor.trim() ? audit.actor.trim() : null;
|
||||
const deviceId =
|
||||
typeof audit?.deviceId === "string" && audit.deviceId.trim() ? audit.deviceId.trim() : null;
|
||||
const clientIp =
|
||||
typeof audit?.clientIp === "string" && audit.clientIp.trim() ? audit.clientIp.trim() : null;
|
||||
const changed = summarizeChangedPaths(audit?.changedPaths);
|
||||
const fields = [];
|
||||
if (actor) {
|
||||
fields.push(`actor=${actor}`);
|
||||
}
|
||||
if (deviceId) {
|
||||
fields.push(`device=${deviceId}`);
|
||||
}
|
||||
if (clientIp) {
|
||||
fields.push(`ip=${clientIp}`);
|
||||
}
|
||||
if (changed) {
|
||||
fields.push(`changedPaths=${changed}`);
|
||||
}
|
||||
return fields.length > 0 ? fields.join(" ") : "actor=<unknown>";
|
||||
}
|
||||
|
||||
/**
|
||||
* Register a callback that scheduleGatewaySigusr1Restart checks before emitting SIGUSR1.
|
||||
* The callback should return the number of pending items (0 = safe to restart).
|
||||
@@ -44,8 +102,10 @@ export function setPreRestartDeferralCheck(fn: () => number): void {
|
||||
*/
|
||||
export function emitGatewayRestart(): boolean {
|
||||
if (hasUnconsumedRestartSignal()) {
|
||||
clearPendingScheduledRestart();
|
||||
return false;
|
||||
}
|
||||
clearPendingScheduledRestart();
|
||||
const cycleToken = ++restartCycleToken;
|
||||
emittedRestartToken = cycleToken;
|
||||
authorizeGatewaySigusr1Restart();
|
||||
@@ -60,6 +120,7 @@ export function emitGatewayRestart(): boolean {
|
||||
emittedRestartToken = consumedRestartToken;
|
||||
return false;
|
||||
}
|
||||
lastRestartEmittedAt = Date.now();
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -293,11 +354,14 @@ export type ScheduledRestart = {
|
||||
delayMs: number;
|
||||
reason?: string;
|
||||
mode: "emit" | "signal";
|
||||
coalesced: boolean;
|
||||
cooldownMsApplied: number;
|
||||
};
|
||||
|
||||
export function scheduleGatewaySigusr1Restart(opts?: {
|
||||
delayMs?: number;
|
||||
reason?: string;
|
||||
audit?: RestartAuditInfo;
|
||||
}): ScheduledRestart {
|
||||
const delayMsRaw =
|
||||
typeof opts?.delayMs === "number" && Number.isFinite(opts.delayMs)
|
||||
@@ -308,22 +372,77 @@ export function scheduleGatewaySigusr1Restart(opts?: {
|
||||
typeof opts?.reason === "string" && opts.reason.trim()
|
||||
? opts.reason.trim().slice(0, 200)
|
||||
: undefined;
|
||||
const mode = process.listenerCount("SIGUSR1") > 0 ? "emit" : "signal";
|
||||
const nowMs = Date.now();
|
||||
const cooldownMsApplied = Math.max(0, lastRestartEmittedAt + RESTART_COOLDOWN_MS - nowMs);
|
||||
const requestedDueAt = nowMs + delayMs + cooldownMsApplied;
|
||||
|
||||
setTimeout(() => {
|
||||
const pendingCheck = preRestartCheck;
|
||||
if (!pendingCheck) {
|
||||
emitGatewayRestart();
|
||||
return;
|
||||
if (hasUnconsumedRestartSignal()) {
|
||||
restartLog.warn(
|
||||
`restart request coalesced (already in-flight) reason=${reason ?? "unspecified"} ${formatRestartAudit(opts?.audit)}`,
|
||||
);
|
||||
return {
|
||||
ok: true,
|
||||
pid: process.pid,
|
||||
signal: "SIGUSR1",
|
||||
delayMs: 0,
|
||||
reason,
|
||||
mode,
|
||||
coalesced: true,
|
||||
cooldownMsApplied,
|
||||
};
|
||||
}
|
||||
|
||||
if (pendingRestartTimer) {
|
||||
const remainingMs = Math.max(0, pendingRestartDueAt - nowMs);
|
||||
const shouldPullEarlier = requestedDueAt < pendingRestartDueAt;
|
||||
if (shouldPullEarlier) {
|
||||
restartLog.warn(
|
||||
`restart request rescheduled earlier reason=${reason ?? "unspecified"} pendingReason=${pendingRestartReason ?? "unspecified"} oldDelayMs=${remainingMs} newDelayMs=${Math.max(0, requestedDueAt - nowMs)} ${formatRestartAudit(opts?.audit)}`,
|
||||
);
|
||||
clearPendingScheduledRestart();
|
||||
} else {
|
||||
restartLog.warn(
|
||||
`restart request coalesced (already scheduled) reason=${reason ?? "unspecified"} pendingReason=${pendingRestartReason ?? "unspecified"} delayMs=${remainingMs} ${formatRestartAudit(opts?.audit)}`,
|
||||
);
|
||||
return {
|
||||
ok: true,
|
||||
pid: process.pid,
|
||||
signal: "SIGUSR1",
|
||||
delayMs: remainingMs,
|
||||
reason,
|
||||
mode,
|
||||
coalesced: true,
|
||||
cooldownMsApplied,
|
||||
};
|
||||
}
|
||||
deferGatewayRestartUntilIdle({ getPendingCount: pendingCheck });
|
||||
}, delayMs);
|
||||
}
|
||||
|
||||
pendingRestartDueAt = requestedDueAt;
|
||||
pendingRestartReason = reason;
|
||||
pendingRestartTimer = setTimeout(
|
||||
() => {
|
||||
pendingRestartTimer = null;
|
||||
pendingRestartDueAt = 0;
|
||||
pendingRestartReason = undefined;
|
||||
const pendingCheck = preRestartCheck;
|
||||
if (!pendingCheck) {
|
||||
emitGatewayRestart();
|
||||
return;
|
||||
}
|
||||
deferGatewayRestartUntilIdle({ getPendingCount: pendingCheck });
|
||||
},
|
||||
Math.max(0, requestedDueAt - nowMs),
|
||||
);
|
||||
return {
|
||||
ok: true,
|
||||
pid: process.pid,
|
||||
signal: "SIGUSR1",
|
||||
delayMs,
|
||||
delayMs: Math.max(0, requestedDueAt - nowMs),
|
||||
reason,
|
||||
mode: process.listenerCount("SIGUSR1") > 0 ? "emit" : "signal",
|
||||
mode,
|
||||
coalesced: false,
|
||||
cooldownMsApplied,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -336,5 +455,7 @@ export const __testing = {
|
||||
restartCycleToken = 0;
|
||||
emittedRestartToken = 0;
|
||||
consumedRestartToken = 0;
|
||||
lastRestartEmittedAt = 0;
|
||||
clearPendingScheduledRestart();
|
||||
},
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user