fix(daemon): recover Windows restarts from unknown stale listeners (openclaw#24734) thanks @chilu18

Verified:
- pnpm vitest src/cli/daemon-cli/restart-health.test.ts src/cli/gateway-cli.coverage.test.ts
- pnpm oxfmt --check src/cli/daemon-cli/restart-health.ts src/cli/daemon-cli/restart-health.test.ts
- pnpm check (fails on unrelated repo baseline tsgo errors in extensions/* and src/process/exec.windows.test.ts)

Co-authored-by: chilu18 <7957943+chilu18@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Peter Machona
2026-03-02 14:24:25 +00:00
committed by GitHub
parent a05b8f47b1
commit c2d41dc473
3 changed files with 138 additions and 34 deletions

View File

@@ -88,6 +88,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
port: restartPort, port: restartPort,
attempts: POST_RESTART_HEALTH_ATTEMPTS, attempts: POST_RESTART_HEALTH_ATTEMPTS,
delayMs: POST_RESTART_HEALTH_DELAY_MS, delayMs: POST_RESTART_HEALTH_DELAY_MS,
includeUnknownListenersAsStale: process.platform === "win32",
}); });
if (!health.healthy && health.staleGatewayPids.length > 0) { if (!health.healthy && health.staleGatewayPids.length > 0) {
@@ -105,6 +106,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
port: restartPort, port: restartPort,
attempts: POST_RESTART_HEALTH_ATTEMPTS, attempts: POST_RESTART_HEALTH_ATTEMPTS,
delayMs: POST_RESTART_HEALTH_DELAY_MS, delayMs: POST_RESTART_HEALTH_DELAY_MS,
includeUnknownListenersAsStale: process.platform === "win32",
}); });
} }

View File

@@ -1,4 +1,4 @@
import { beforeEach, describe, expect, it, vi } from "vitest"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { GatewayService } from "../../daemon/service.js"; import type { GatewayService } from "../../daemon/service.js";
import type { PortListenerKind, PortUsage } from "../../infra/ports.js"; import type { PortListenerKind, PortUsage } from "../../infra/ports.js";
@@ -13,6 +13,8 @@ vi.mock("../../infra/ports.js", () => ({
inspectPortUsage: (port: number) => inspectPortUsage(port), inspectPortUsage: (port: number) => inspectPortUsage(port),
})); }));
const originalPlatform = process.platform;
describe("inspectGatewayRestart", () => { describe("inspectGatewayRestart", () => {
beforeEach(() => { beforeEach(() => {
inspectPortUsage.mockReset(); inspectPortUsage.mockReset();
@@ -26,6 +28,10 @@ describe("inspectGatewayRestart", () => {
classifyPortListener.mockReturnValue("gateway"); classifyPortListener.mockReturnValue("gateway");
}); });
// Several tests in this suite override process.platform via
// Object.defineProperty; restore the value captured in originalPlatform after
// every test so the override does not leak into later tests.
afterEach(() => {
Object.defineProperty(process, "platform", { value: originalPlatform, configurable: true });
});
it("treats a gateway listener child pid as healthy ownership", async () => { it("treats a gateway listener child pid as healthy ownership", async () => {
const service = { const service = {
readRuntime: vi.fn(async () => ({ status: "running", pid: 7000 })), readRuntime: vi.fn(async () => ({ status: "running", pid: 7000 })),
@@ -63,4 +69,104 @@ describe("inspectGatewayRestart", () => {
expect(snapshot.healthy).toBe(false); expect(snapshot.healthy).toBe(false);
expect(snapshot.staleGatewayPids).toEqual([9000]); expect(snapshot.staleGatewayPids).toEqual([9000]);
}); });
// Fallback happy path: platform is win32, the service runtime reports
// "stopped", the port is busy, and its only listener is classified as
// "unknown". With includeUnknownListenersAsStale enabled, that listener's pid
// must be reported in staleGatewayPids.
it("treats unknown listeners as stale on Windows when enabled", async () => {
// Simulate Windows; the suite's afterEach hook restores the real platform.
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
// Every listener is classified as "unknown" for this test.
classifyPortListener.mockReturnValue("unknown");
// Only readRuntime is stubbed; the cast stands in for a full GatewayService.
const service = {
readRuntime: vi.fn(async () => ({ status: "stopped" })),
} as unknown as GatewayService;
inspectPortUsage.mockResolvedValue({
port: 18789,
status: "busy",
listeners: [{ pid: 10920, command: "unknown" }],
hints: [],
});
// The module under test is imported inside the test body.
const { inspectGatewayRestart } = await import("./restart-health.js");
const snapshot = await inspectGatewayRestart({
service,
port: 18789,
includeUnknownListenersAsStale: true,
});
expect(snapshot.staleGatewayPids).toEqual([10920]);
});
// Opt-out case: same fixture as the enabled-fallback test (win32, runtime
// "stopped", busy port, one "unknown" listener), but with
// includeUnknownListenersAsStale set to false. The unknown listener must not
// be reported as stale.
it("does not treat unknown listeners as stale when fallback is disabled", async () => {
// Simulate Windows; the suite's afterEach hook restores the real platform.
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
classifyPortListener.mockReturnValue("unknown");
// Only readRuntime is stubbed; the cast stands in for a full GatewayService.
const service = {
readRuntime: vi.fn(async () => ({ status: "stopped" })),
} as unknown as GatewayService;
inspectPortUsage.mockResolvedValue({
port: 18789,
status: "busy",
listeners: [{ pid: 10920, command: "unknown" }],
hints: [],
});
const { inspectGatewayRestart } = await import("./restart-health.js");
const snapshot = await inspectGatewayRestart({
service,
port: 18789,
// The flag under test: fallback explicitly disabled.
includeUnknownListenersAsStale: false,
});
expect(snapshot.staleGatewayPids).toEqual([]);
});
// Guard case: the fallback is enabled and the platform is win32, but the
// service runtime reports "running". The unknown-listener fallback only
// applies when the runtime is not running, so nothing may be reported as
// stale here.
it("does not apply unknown-listener fallback while runtime is running", async () => {
// Simulate Windows; the suite's afterEach hook restores the real platform.
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
classifyPortListener.mockReturnValue("unknown");
// The runtime pid matches the listener pid in the port-usage fixture below.
const service = {
readRuntime: vi.fn(async () => ({ status: "running", pid: 10920 })),
} as unknown as GatewayService;
inspectPortUsage.mockResolvedValue({
port: 18789,
status: "busy",
listeners: [{ pid: 10920, command: "unknown" }],
hints: [],
});
const { inspectGatewayRestart } = await import("./restart-health.js");
const snapshot = await inspectGatewayRestart({
service,
port: 18789,
includeUnknownListenersAsStale: true,
});
expect(snapshot.staleGatewayPids).toEqual([]);
});
// Selectivity case: with the fallback enabled on win32 and the runtime
// stopped, a listener whose classification is neither "gateway" nor "unknown"
// must not be reported as stale.
it("does not treat known non-gateway listeners as stale in fallback mode", async () => {
// Simulate Windows; the suite's afterEach hook restores the real platform.
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
// The mocked classification ("ssh") is what drives the outcome; the
// "nginx.exe" command string in the fixture below is never inspected because
// classifyPortListener is mocked.
classifyPortListener.mockReturnValue("ssh");
const service = {
readRuntime: vi.fn(async () => ({ status: "stopped" })),
} as unknown as GatewayService;
inspectPortUsage.mockResolvedValue({
port: 18789,
status: "busy",
listeners: [{ pid: 22001, command: "nginx.exe" }],
hints: [],
});
const { inspectGatewayRestart } = await import("./restart-health.js");
const snapshot = await inspectGatewayRestart({
service,
port: 18789,
includeUnknownListenersAsStale: true,
});
expect(snapshot.staleGatewayPids).toEqual([]);
});
}); });

View File

@@ -6,6 +6,7 @@ import {
inspectPortUsage, inspectPortUsage,
type PortUsage, type PortUsage,
} from "../../infra/ports.js"; } from "../../infra/ports.js";
import { killProcessTree } from "../../process/kill-tree.js";
import { sleep } from "../../utils.js"; import { sleep } from "../../utils.js";
export const DEFAULT_RESTART_HEALTH_TIMEOUT_MS = 60_000; export const DEFAULT_RESTART_HEALTH_TIMEOUT_MS = 60_000;
@@ -32,6 +33,7 @@ export async function inspectGatewayRestart(params: {
service: GatewayService; service: GatewayService;
port: number; port: number;
env?: NodeJS.ProcessEnv; env?: NodeJS.ProcessEnv;
includeUnknownListenersAsStale?: boolean;
}): Promise<GatewayRestartSnapshot> { }): Promise<GatewayRestartSnapshot> {
const env = params.env ?? process.env; const env = params.env ?? process.env;
let runtime: GatewayServiceRuntime = { status: "unknown" }; let runtime: GatewayServiceRuntime = { status: "unknown" };
@@ -60,6 +62,16 @@ export async function inspectGatewayRestart(params: {
(listener) => classifyPortListener(listener, params.port) === "gateway", (listener) => classifyPortListener(listener, params.port) === "gateway",
) )
: []; : [];
const fallbackListenerPids =
params.includeUnknownListenersAsStale &&
process.platform === "win32" &&
runtime.status !== "running" &&
portUsage.status === "busy"
? portUsage.listeners
.filter((listener) => classifyPortListener(listener, params.port) === "unknown")
.map((listener) => listener.pid)
.filter((pid): pid is number => Number.isFinite(pid))
: [];
const running = runtime.status === "running"; const running = runtime.status === "running";
const runtimePid = runtime.pid; const runtimePid = runtime.pid;
const ownsPort = const ownsPort =
@@ -69,8 +81,8 @@ export async function inspectGatewayRestart(params: {
(portUsage.status === "busy" && portUsage.listeners.length === 0); (portUsage.status === "busy" && portUsage.listeners.length === 0);
const healthy = running && ownsPort; const healthy = running && ownsPort;
const staleGatewayPids = Array.from( const staleGatewayPids = Array.from(
new Set( new Set([
gatewayListeners ...gatewayListeners
.filter((listener) => Number.isFinite(listener.pid)) .filter((listener) => Number.isFinite(listener.pid))
.filter((listener) => { .filter((listener) => {
if (!running) { if (!running) {
@@ -82,7 +94,10 @@ export async function inspectGatewayRestart(params: {
return !listenerOwnedByRuntimePid({ listener, runtimePid }); return !listenerOwnedByRuntimePid({ listener, runtimePid });
}) })
.map((listener) => listener.pid as number), .map((listener) => listener.pid as number),
), ...fallbackListenerPids.filter(
(pid) => runtime.pid == null || pid !== runtime.pid || !running,
),
]),
); );
return { return {
@@ -99,6 +114,7 @@ export async function waitForGatewayHealthyRestart(params: {
attempts?: number; attempts?: number;
delayMs?: number; delayMs?: number;
env?: NodeJS.ProcessEnv; env?: NodeJS.ProcessEnv;
includeUnknownListenersAsStale?: boolean;
}): Promise<GatewayRestartSnapshot> { }): Promise<GatewayRestartSnapshot> {
const attempts = params.attempts ?? DEFAULT_RESTART_HEALTH_ATTEMPTS; const attempts = params.attempts ?? DEFAULT_RESTART_HEALTH_ATTEMPTS;
const delayMs = params.delayMs ?? DEFAULT_RESTART_HEALTH_DELAY_MS; const delayMs = params.delayMs ?? DEFAULT_RESTART_HEALTH_DELAY_MS;
@@ -107,6 +123,7 @@ export async function waitForGatewayHealthyRestart(params: {
service: params.service, service: params.service,
port: params.port, port: params.port,
env: params.env, env: params.env,
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
}); });
for (let attempt = 0; attempt < attempts; attempt += 1) { for (let attempt = 0; attempt < attempts; attempt += 1) {
@@ -121,6 +138,7 @@ export async function waitForGatewayHealthyRestart(params: {
service: params.service, service: params.service,
port: params.port, port: params.port,
env: params.env, env: params.env,
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
}); });
} }
@@ -156,36 +174,14 @@ export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): stri
} }
export async function terminateStaleGatewayPids(pids: number[]): Promise<number[]> { export async function terminateStaleGatewayPids(pids: number[]): Promise<number[]> {
const killed: number[] = []; const targets = Array.from(
for (const pid of pids) { new Set(pids.filter((pid): pid is number => Number.isFinite(pid) && pid > 0)),
try { );
process.kill(pid, "SIGTERM"); for (const pid of targets) {
killed.push(pid); killProcessTree(pid, { graceMs: 300 });
} catch (err) {
const code = (err as NodeJS.ErrnoException)?.code;
if (code !== "ESRCH") {
throw err;
}
}
} }
if (targets.length > 0) {
if (killed.length === 0) { await sleep(500);
return killed;
} }
return targets;
await sleep(400);
for (const pid of killed) {
try {
process.kill(pid, 0);
process.kill(pid, "SIGKILL");
} catch (err) {
const code = (err as NodeJS.ErrnoException)?.code;
if (code !== "ESRCH") {
throw err;
}
}
}
return killed;
} }