mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 07:11:25 +00:00
fix(gateway): probe port liveness for stale lock recovery
Co-authored-by: Operative-001 <261882263+Operative-001@users.noreply.github.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import { createHash } from "node:crypto";
|
||||
import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import net from "node:net";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
@@ -129,6 +130,35 @@ async function acquireStaleLinuxLock(env: NodeJS.ProcessEnv) {
|
||||
staleProcSpy.mockRestore();
|
||||
}
|
||||
|
||||
/**
 * Starts a TCP server bound to 127.0.0.1 on an OS-assigned port (port 0).
 *
 * @returns The port the server ended up on, plus a `close` function that
 *   stops the server and rejects if `server.close` reports an error.
 * @throws Error if the bound address is missing or is not an address object.
 */
async function listenOnLoopbackPort() {
  const server = net.createServer();

  // Wait until the server is listening; a bind failure rejects instead.
  await new Promise<void>((resolve, reject) => {
    const onListenError = (err: Error) => reject(err);
    server.once("error", onListenError);
    server.listen(0, "127.0.0.1", () => {
      // Listening succeeded, so the one-shot bind-error handler is no longer needed.
      server.off("error", onListenError);
      resolve();
    });
  });

  const bound = server.address();
  if (bound === null || typeof bound === "string") {
    throw new Error("failed to resolve loopback test port");
  }

  const close = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      server.close((err) => (err ? reject(err) : resolve()));
    });

  return { port: bound.port, close };
}
|
||||
|
||||
describe("gateway lock", () => {
|
||||
beforeAll(async () => {
|
||||
fixtureRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-gateway-lock-"));
|
||||
@@ -227,6 +257,50 @@ describe("gateway lock", () => {
|
||||
statSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("treats lock as stale when owner pid is alive but configured port is free", async () => {
  vi.useRealTimers();
  const env = await makeEnv();
  const existingLock = {
    startTime: 111,
    createdAt: new Date().toISOString(),
  };
  await writeLockFile(env, existingLock);

  // Grab an OS-assigned port and close the listener straight away, so the
  // port passed to the lock below is expected to have nothing listening on it.
  const { port, close: stopListening } = await listenOnLoopbackPort();
  await stopListening();

  const acquired = await acquireForTest(env, {
    timeoutMs: 80,
    pollIntervalMs: 5,
    staleMs: 10_000,
    platform: "darwin",
    port,
  });

  expect(acquired).not.toBeNull();
  await acquired?.release();
});
|
||||
|
||||
it("keeps lock when configured port is busy and owner pid is alive", async () => {
  vi.useRealTimers();
  const env = await makeEnv();
  const existingLock = {
    startTime: 111,
    createdAt: new Date().toISOString(),
  };
  await writeLockFile(env, existingLock);

  // Keep a real listener open for the whole acquisition attempt so the
  // configured port accepts connections while the lock is being probed.
  const busyListener = await listenOnLoopbackPort();
  try {
    await expect(
      acquireForTest(env, {
        timeoutMs: 20,
        pollIntervalMs: 2,
        staleMs: 10_000,
        platform: "darwin",
        port: busyListener.port,
      }),
    ).rejects.toBeInstanceOf(GatewayLockError);
  } finally {
    // Always release the port, even when the expectation above fails.
    await busyListener.close();
  }
});
|
||||
|
||||
it("returns null when multi-gateway override is enabled", async () => {
|
||||
const env = await makeEnv();
|
||||
const lock = await acquireGatewayLock({
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import { createHash } from "node:crypto";
|
||||
import fsSync from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import net from "node:net";
|
||||
import path from "node:path";
|
||||
import { resolveConfigPath, resolveGatewayLockDir, resolveStateDir } from "../config/paths.js";
|
||||
import { isPidAlive } from "../shared/pid-alive.js";
|
||||
@@ -8,6 +9,7 @@ import { isPidAlive } from "../shared/pid-alive.js";
|
||||
const DEFAULT_TIMEOUT_MS = 5000;
|
||||
const DEFAULT_POLL_INTERVAL_MS = 100;
|
||||
const DEFAULT_STALE_MS = 30_000;
|
||||
const DEFAULT_PORT_PROBE_TIMEOUT_MS = 1000;
|
||||
|
||||
type LockPayload = {
|
||||
pid: number;
|
||||
@@ -29,6 +31,7 @@ export type GatewayLockOptions = {
|
||||
staleMs?: number;
|
||||
allowInTests?: boolean;
|
||||
platform?: NodeJS.Platform;
|
||||
port?: number;
|
||||
};
|
||||
|
||||
export class GatewayLockError extends Error {
|
||||
@@ -100,11 +103,47 @@ function readLinuxStartTime(pid: number): number | null {
|
||||
}
|
||||
}
|
||||
|
||||
function resolveGatewayOwnerStatus(
|
||||
/**
 * Probes `host:port` with a single TCP connection attempt.
 *
 * The returned promise never rejects. It resolves to:
 * - `false` when the connection is established (something is listening);
 * - `true` when the socket emits an error of any kind;
 * - `true` when neither happens within `DEFAULT_PORT_PROBE_TIMEOUT_MS`.
 *
 * @param port TCP port to probe.
 * @param host Host to connect to; defaults to the IPv4 loopback address.
 * @returns Whether the port is considered free.
 */
async function checkPortFree(port: number, host = "127.0.0.1"): Promise<boolean> {
  return await new Promise<boolean>((resolve) => {
    const probe = net.createConnection({ port, host });
    let done = false;

    // Settles the promise once; later calls are ignored. Tears down the timer
    // and the socket before resolving.
    const settle = (isFree: boolean): void => {
      if (!done) {
        done = true;
        clearTimeout(deadline);
        probe.removeAllListeners();
        probe.destroy();
        resolve(isFree);
      }
    };

    // A probe that neither connects nor errors before the deadline is reported
    // as free, i.e. it is treated like "no responsive local listener".
    const deadline = setTimeout(() => settle(true), DEFAULT_PORT_PROBE_TIMEOUT_MS);

    probe.once("connect", () => settle(false));
    probe.once("error", () => settle(true));
  });
}
|
||||
|
||||
async function resolveGatewayOwnerStatus(
|
||||
pid: number,
|
||||
payload: LockPayload | null,
|
||||
platform: NodeJS.Platform,
|
||||
): LockOwnerStatus {
|
||||
port: number | undefined,
|
||||
): Promise<LockOwnerStatus> {
|
||||
if (port != null) {
|
||||
const portFree = await checkPortFree(port);
|
||||
if (portFree) {
|
||||
return "dead";
|
||||
}
|
||||
}
|
||||
|
||||
if (!isPidAlive(pid)) {
|
||||
return "dead";
|
||||
}
|
||||
@@ -178,6 +217,7 @@ export async function acquireGatewayLock(
|
||||
const pollIntervalMs = opts.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS;
|
||||
const staleMs = opts.staleMs ?? DEFAULT_STALE_MS;
|
||||
const platform = opts.platform ?? process.platform;
|
||||
const port = opts.port;
|
||||
const { lockPath, configPath } = resolveGatewayLockPath(env);
|
||||
await fs.mkdir(path.dirname(lockPath), { recursive: true });
|
||||
|
||||
@@ -214,7 +254,7 @@ export async function acquireGatewayLock(
|
||||
lastPayload = await readLockPayload(lockPath);
|
||||
const ownerPid = lastPayload?.pid;
|
||||
const ownerStatus = ownerPid
|
||||
? resolveGatewayOwnerStatus(ownerPid, lastPayload, platform)
|
||||
? await resolveGatewayOwnerStatus(ownerPid, lastPayload, platform, port)
|
||||
: "unknown";
|
||||
if (ownerStatus === "dead" && ownerPid) {
|
||||
await fs.rm(lockPath, { force: true });
|
||||
|
||||
Reference in New Issue
Block a user