fix: harden gateway control-plane restart protections

This commit is contained in:
Peter Steinberger
2026-02-19 14:29:44 +01:00
parent 14b4c7fd56
commit ff74d89e86
11 changed files with 523 additions and 16 deletions

View File

@@ -124,6 +124,56 @@ describe("infra runtime", () => {
process.removeListener("SIGUSR1", handler);
}
});
it("coalesces duplicate scheduled restarts into a single pending timer", async () => {
const emitSpy = vi.spyOn(process, "emit");
const handler = () => {};
process.on("SIGUSR1", handler);
try {
const first = scheduleGatewaySigusr1Restart({ delayMs: 1_000, reason: "first" });
const second = scheduleGatewaySigusr1Restart({ delayMs: 1_000, reason: "second" });
expect(first.coalesced).toBe(false);
expect(second.coalesced).toBe(true);
await vi.advanceTimersByTimeAsync(999);
expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");
await vi.advanceTimersByTimeAsync(1);
const sigusr1Emits = emitSpy.mock.calls.filter((args) => args[0] === "SIGUSR1");
expect(sigusr1Emits.length).toBe(1);
} finally {
process.removeListener("SIGUSR1", handler);
}
});
it("applies restart cooldown between emitted restart cycles", async () => {
const emitSpy = vi.spyOn(process, "emit");
const handler = () => {};
process.on("SIGUSR1", handler);
try {
const first = scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "first" });
expect(first.coalesced).toBe(false);
expect(first.delayMs).toBe(0);
await vi.advanceTimersByTimeAsync(0);
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(true);
markGatewaySigusr1RestartHandled();
const second = scheduleGatewaySigusr1Restart({ delayMs: 0, reason: "second" });
expect(second.coalesced).toBe(false);
expect(second.delayMs).toBe(30_000);
expect(second.cooldownMsApplied).toBe(30_000);
await vi.advanceTimersByTimeAsync(29_999);
expect(emitSpy.mock.calls.filter((args) => args[0] === "SIGUSR1").length).toBe(1);
await vi.advanceTimersByTimeAsync(1);
expect(emitSpy.mock.calls.filter((args) => args[0] === "SIGUSR1").length).toBe(2);
} finally {
process.removeListener("SIGUSR1", handler);
}
});
});
describe("pre-restart deferral check", () => {

View File

@@ -3,6 +3,7 @@ import {
resolveGatewayLaunchAgentLabel,
resolveGatewaySystemdServiceName,
} from "../daemon/constants.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
export type RestartAttempt = {
ok: boolean;
@@ -15,6 +16,9 @@ const SPAWN_TIMEOUT_MS = 2000;
const SIGUSR1_AUTH_GRACE_MS = 5000;
const DEFAULT_DEFERRAL_POLL_MS = 500;
const DEFAULT_DEFERRAL_MAX_WAIT_MS = 30_000;
const RESTART_COOLDOWN_MS = 30_000;
const restartLog = createSubsystemLogger("restart");
let sigusr1AuthorizedCount = 0;
let sigusr1AuthorizedUntil = 0;
@@ -23,11 +27,65 @@ let preRestartCheck: (() => number) | null = null;
let restartCycleToken = 0;
let emittedRestartToken = 0;
let consumedRestartToken = 0;
let lastRestartEmittedAt = 0;
let pendingRestartTimer: ReturnType<typeof setTimeout> | null = null;
let pendingRestartDueAt = 0;
let pendingRestartReason: string | undefined;
function hasUnconsumedRestartSignal(): boolean {
return emittedRestartToken > consumedRestartToken;
}
function clearPendingScheduledRestart(): void {
if (pendingRestartTimer) {
clearTimeout(pendingRestartTimer);
}
pendingRestartTimer = null;
pendingRestartDueAt = 0;
pendingRestartReason = undefined;
}
export type RestartAuditInfo = {
actor?: string;
deviceId?: string;
clientIp?: string;
changedPaths?: string[];
};
function summarizeChangedPaths(paths: string[] | undefined, maxPaths = 6): string | null {
if (!Array.isArray(paths) || paths.length === 0) {
return null;
}
if (paths.length <= maxPaths) {
return paths.join(",");
}
const head = paths.slice(0, maxPaths).join(",");
return `${head},+${paths.length - maxPaths} more`;
}
function formatRestartAudit(audit: RestartAuditInfo | undefined): string {
const actor = typeof audit?.actor === "string" && audit.actor.trim() ? audit.actor.trim() : null;
const deviceId =
typeof audit?.deviceId === "string" && audit.deviceId.trim() ? audit.deviceId.trim() : null;
const clientIp =
typeof audit?.clientIp === "string" && audit.clientIp.trim() ? audit.clientIp.trim() : null;
const changed = summarizeChangedPaths(audit?.changedPaths);
const fields = [];
if (actor) {
fields.push(`actor=${actor}`);
}
if (deviceId) {
fields.push(`device=${deviceId}`);
}
if (clientIp) {
fields.push(`ip=${clientIp}`);
}
if (changed) {
fields.push(`changedPaths=${changed}`);
}
return fields.length > 0 ? fields.join(" ") : "actor=<unknown>";
}
/**
* Register a callback that scheduleGatewaySigusr1Restart checks before emitting SIGUSR1.
* The callback should return the number of pending items (0 = safe to restart).
@@ -44,8 +102,10 @@ export function setPreRestartDeferralCheck(fn: () => number): void {
*/
export function emitGatewayRestart(): boolean {
if (hasUnconsumedRestartSignal()) {
clearPendingScheduledRestart();
return false;
}
clearPendingScheduledRestart();
const cycleToken = ++restartCycleToken;
emittedRestartToken = cycleToken;
authorizeGatewaySigusr1Restart();
@@ -60,6 +120,7 @@ export function emitGatewayRestart(): boolean {
emittedRestartToken = consumedRestartToken;
return false;
}
lastRestartEmittedAt = Date.now();
return true;
}
@@ -293,11 +354,14 @@ export type ScheduledRestart = {
delayMs: number;
reason?: string;
mode: "emit" | "signal";
coalesced: boolean;
cooldownMsApplied: number;
};
export function scheduleGatewaySigusr1Restart(opts?: {
delayMs?: number;
reason?: string;
audit?: RestartAuditInfo;
}): ScheduledRestart {
const delayMsRaw =
typeof opts?.delayMs === "number" && Number.isFinite(opts.delayMs)
@@ -308,22 +372,77 @@ export function scheduleGatewaySigusr1Restart(opts?: {
typeof opts?.reason === "string" && opts.reason.trim()
? opts.reason.trim().slice(0, 200)
: undefined;
const mode = process.listenerCount("SIGUSR1") > 0 ? "emit" : "signal";
const nowMs = Date.now();
const cooldownMsApplied = Math.max(0, lastRestartEmittedAt + RESTART_COOLDOWN_MS - nowMs);
const requestedDueAt = nowMs + delayMs + cooldownMsApplied;
setTimeout(() => {
const pendingCheck = preRestartCheck;
if (!pendingCheck) {
emitGatewayRestart();
return;
if (hasUnconsumedRestartSignal()) {
restartLog.warn(
`restart request coalesced (already in-flight) reason=${reason ?? "unspecified"} ${formatRestartAudit(opts?.audit)}`,
);
return {
ok: true,
pid: process.pid,
signal: "SIGUSR1",
delayMs: 0,
reason,
mode,
coalesced: true,
cooldownMsApplied,
};
}
if (pendingRestartTimer) {
const remainingMs = Math.max(0, pendingRestartDueAt - nowMs);
const shouldPullEarlier = requestedDueAt < pendingRestartDueAt;
if (shouldPullEarlier) {
restartLog.warn(
`restart request rescheduled earlier reason=${reason ?? "unspecified"} pendingReason=${pendingRestartReason ?? "unspecified"} oldDelayMs=${remainingMs} newDelayMs=${Math.max(0, requestedDueAt - nowMs)} ${formatRestartAudit(opts?.audit)}`,
);
clearPendingScheduledRestart();
} else {
restartLog.warn(
`restart request coalesced (already scheduled) reason=${reason ?? "unspecified"} pendingReason=${pendingRestartReason ?? "unspecified"} delayMs=${remainingMs} ${formatRestartAudit(opts?.audit)}`,
);
return {
ok: true,
pid: process.pid,
signal: "SIGUSR1",
delayMs: remainingMs,
reason,
mode,
coalesced: true,
cooldownMsApplied,
};
}
deferGatewayRestartUntilIdle({ getPendingCount: pendingCheck });
}, delayMs);
}
pendingRestartDueAt = requestedDueAt;
pendingRestartReason = reason;
pendingRestartTimer = setTimeout(
() => {
pendingRestartTimer = null;
pendingRestartDueAt = 0;
pendingRestartReason = undefined;
const pendingCheck = preRestartCheck;
if (!pendingCheck) {
emitGatewayRestart();
return;
}
deferGatewayRestartUntilIdle({ getPendingCount: pendingCheck });
},
Math.max(0, requestedDueAt - nowMs),
);
return {
ok: true,
pid: process.pid,
signal: "SIGUSR1",
delayMs,
delayMs: Math.max(0, requestedDueAt - nowMs),
reason,
mode: process.listenerCount("SIGUSR1") > 0 ? "emit" : "signal",
mode,
coalesced: false,
cooldownMsApplied,
};
}
@@ -336,5 +455,7 @@ export const __testing = {
restartCycleToken = 0;
emittedRestartToken = 0;
consumedRestartToken = 0;
lastRestartEmittedAt = 0;
clearPendingScheduledRestart();
},
};