refactor(restart): extract stale pid cleanup and supervisor markers

This commit is contained in:
Peter Steinberger
2026-02-26 16:39:14 +01:00
parent c81e9866ff
commit 4da6a7f212
6 changed files with 259 additions and 136 deletions

View File

@@ -1,11 +1,10 @@
import { spawnSync } from "node:child_process";
import { resolveGatewayPort } from "../config/paths.js";
import {
resolveGatewayLaunchAgentLabel,
resolveGatewaySystemdServiceName,
} from "../daemon/constants.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import { resolveLsofCommandSync } from "./ports-lsof.js";
import { cleanStaleGatewayProcessesSync, findGatewayPidsOnPortSync } from "./restart-stale-pids.js";
export type RestartAttempt = {
ok: boolean;
@@ -22,6 +21,8 @@ const RESTART_COOLDOWN_MS = 30_000;
// Logger created for the "restart" subsystem.
const restartLog = createSubsystemLogger("restart");
// Re-exports the helper imported from ./restart-stale-pids.js through this module.
export { findGatewayPidsOnPortSync };
// Module-level SIGUSR1 bookkeeping; every value starts at its "nothing authorized" default.
// NOTE(review): presumably these track how many SIGUSR1-triggered restarts are authorized,
// until which timestamp, and whether externally sent signals are accepted — confirm against
// the code that reads and writes them.
let sigusr1AuthorizedCount = 0;
let sigusr1AuthorizedUntil = 0;
let sigusr1ExternalAllowed = false;
@@ -285,99 +286,6 @@ function normalizeSystemdUnit(raw?: string, profile?: string): string {
return unit.endsWith(".service") ? unit : `${unit}.service`;
}
/**
 * Synchronously list the PIDs of openclaw processes that hold a listening TCP
 * socket on `port`, as reported by `lsof`.
 *
 * A process counts as an openclaw process when its lsof command name contains
 * "openclaw" (case-insensitive).
 *
 * @param port - TCP port to inspect.
 * @returns Matching PIDs, never including the current process. The list is
 *   empty on Windows and whenever `lsof` errors or exits with a non-zero status.
 */
export function findGatewayPidsOnPortSync(port: number): number[] {
  if (process.platform === "win32") {
    return [];
  }

  const lsofBinary = resolveLsofCommandSync();
  // -Fpc asks lsof for field output: "p<pid>" and "c<command>" lines.
  const probe = spawnSync(lsofBinary, ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-Fpc"], {
    encoding: "utf8",
    timeout: SPAWN_TIMEOUT_MS,
  });
  if (probe.error || probe.status !== 0) {
    return [];
  }

  const matches: number[] = [];
  let recordPid: number | undefined;
  let recordCommand: string | undefined;

  // Commit the record collected so far if it has a valid PID and an openclaw command.
  const commitRecord = (): void => {
    if (recordPid !== undefined && recordCommand?.toLowerCase().includes("openclaw")) {
      matches.push(recordPid);
    }
  };

  for (const entry of probe.stdout.split(/\r?\n/)) {
    if (entry === "") {
      continue;
    }
    const value = entry.slice(1);
    switch (entry.charAt(0)) {
      case "p": {
        // A new "p" line starts the next process record; flush the previous one first.
        commitRecord();
        const candidate = Number.parseInt(value, 10);
        recordPid = Number.isFinite(candidate) && candidate > 0 ? candidate : undefined;
        recordCommand = undefined;
        break;
      }
      case "c":
        recordCommand = value;
        break;
      default:
        // Other lsof fields are irrelevant here.
        break;
    }
  }
  // The last record has no following "p" line to trigger its flush.
  commitRecord();

  return matches.filter((candidate) => candidate !== process.pid);
}
/** Grace period (ms) granted after SIGTERM before survivors are force-killed. */
const STALE_SIGTERM_WAIT_MS = 300;
/** Settle time (ms) after SIGKILL before control returns to the caller. */
const STALE_SIGKILL_WAIT_MS = 200;

/**
 * Synchronously terminate stale gateway processes.
 *
 * Sends SIGTERM to every target, blocks for {@link STALE_SIGTERM_WAIT_MS}, then
 * sends SIGKILL to any process that is still alive and blocks for
 * {@link STALE_SIGKILL_WAIT_MS} so the kernel can reap it.
 *
 * Entries that are not positive integers, and the current process itself, are
 * skipped: pid <= 0 addresses whole process groups in kill(2), and the caller
 * is by definition not stale.
 *
 * @param pids - Candidate process IDs.
 * @returns The PIDs that were successfully sent SIGTERM (in input order).
 */
function terminateStaleProcessesSync(pids: number[]): number[] {
  // Block the calling thread in-process. This avoids depending on an external
  // `sleep` binary, whose POSIX contract only covers whole seconds; a failed
  // fractional sleep would silently remove the grace period before SIGKILL.
  const pauseSync = (ms: number): void => {
    Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
  };

  const signalled: number[] = [];
  for (const pid of pids) {
    if (!Number.isInteger(pid) || pid <= 0 || pid === process.pid) {
      continue;
    }
    try {
      process.kill(pid, "SIGTERM");
      signalled.push(pid);
    } catch {
      // Process already gone (ESRCH) or not ours to signal; nothing to wait for.
    }
  }
  if (signalled.length === 0) {
    return signalled;
  }

  pauseSync(STALE_SIGTERM_WAIT_MS);

  let forcedCount = 0;
  for (const pid of signalled) {
    try {
      // Signal 0 only probes for existence; it throws once the process has exited.
      process.kill(pid, 0);
      process.kill(pid, "SIGKILL");
      forcedCount += 1;
    } catch {
      // Exited during the grace period.
    }
  }
  // Only wait for reaping when something actually had to be force-killed.
  if (forcedCount > 0) {
    pauseSync(STALE_SIGKILL_WAIT_MS);
  }
  return signalled;
}
/**
 * Inspect the gateway port and kill any stale gateway processes holding it.
 * Called before service restart commands to prevent port conflicts.
 *
 * Best-effort: this never throws. Any failure while resolving the port,
 * probing it, or terminating processes is logged and reported as "nothing
 * cleaned".
 *
 * @returns The PIDs reported by `terminateStaleProcessesSync`, or an empty
 *   list when no stale process was found or the cleanup failed.
 */
function cleanStaleGatewayProcessesSync(): number[] {
  try {
    const port = resolveGatewayPort(undefined, process.env);
    const stalePids = findGatewayPidsOnPortSync(port);
    if (stalePids.length === 0) {
      return [];
    }
    restartLog.warn(
      `killing ${stalePids.length} stale gateway process(es) before restart: ${stalePids.join(", ")}`,
    );
    return terminateStaleProcessesSync(stalePids);
  } catch (err: unknown) {
    // Leave a trace instead of failing silently, so a later port conflict can be diagnosed.
    const reason = err instanceof Error ? err.message : String(err);
    try {
      restartLog.warn(`stale gateway process cleanup failed: ${reason}`);
    } catch {
      // Logging must not break the never-throws contract.
    }
    return [];
  }
}
export function triggerOpenClawRestart(): RestartAttempt {
if (process.env.VITEST || process.env.NODE_ENV === "test") {
return { ok: true, method: "supervisor", detail: "test mode" };