fix: release stale session locks and add watchdog for hung API calls (#18060)

When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway acquires a session .jsonl.lock but the promise
never resolves, so the try/finally block never reaches release(). Since
the owning PID is the gateway itself, stale detection cannot help —
isPidAlive() always returns true.

This commit adds four layers of defense:

1. **In-process lock watchdog** (session-write-lock.ts)
   - Track acquiredAt timestamp on each held lock
   - 60-second interval timer checks all held locks
   - Auto-releases any lock held longer than maxHoldMs (default 5 min)
   - Catches the hung-API-call case that try/finally cannot

2. **Gateway startup cleanup** (server-startup.ts)
   - On boot, scan all agent session directories for *.jsonl.lock files
   - Remove locks with a missing or dead PID, an unparseable createdAt, or older than staleMs (30 min by default)
   - Log each cleaned lock for diagnostics

3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
   - New health check scans for .jsonl.lock files
   - Reports PID status and age of each lock found
   - In --fix mode, removes stale locks automatically

4. **Transcript error entry on API failure** (attempt.ts)
   - When promptError is set, write an error marker to the session
     transcript before releasing the lock
   - Preserves conversation history even on model API failures

Closes #18060
This commit is contained in:
Vishal Doshi
2026-02-16 13:57:35 +00:00
committed by Peter Steinberger
parent 7d8d8c338b
commit e91a5b0216
8 changed files with 650 additions and 46 deletions

View File

@@ -4,26 +4,51 @@ import path from "node:path";
import { isPidAlive } from "../shared/pid-alive.js";
type LockFilePayload = {
pid: number;
createdAt: string;
pid?: number;
createdAt?: string;
};
/**
 * In-process bookkeeping for one session lock file that this process currently holds.
 * Entries live in the HELD_LOCKS map, keyed by the normalized session file path.
 */
type HeldLock = {
// Number of outstanding acquisitions in this process; incremented when the same
// session file is acquired again, decremented on release (forced release sets it to 0).
count: number;
// Handle returned by opening the lock file with the exclusive "wx" flag; closed on release.
handle: fs.FileHandle;
// Path of the on-disk lock file (the normalized session file path plus ".lock").
lockPath: string;
// Date.now() value captured when the lock was first acquired; read by the watchdog check.
acquiredAt: number;
// Hold duration in milliseconds after which the watchdog check force-releases this lock.
maxHoldMs: number;
// Set while a release (close handle + remove file) is running; cleared once it settles.
releasePromise?: Promise<void>;
};
/**
 * Result of inspecting a single `*.jsonl.lock` file, as reported by cleanStaleLockFiles.
 */
export type SessionLockInspection = {
// Full path of the inspected lock file.
lockPath: string;
// PID recorded in the lock payload, or null when the payload has no numeric pid.
pid: number | null;
// Result of isPidAlive(pid); always false when pid is null.
pidAlive: boolean;
// createdAt string recorded in the lock payload, or null when absent / not a string.
createdAt: string | null;
// Milliseconds between createdAt and the inspection time (clamped to >= 0),
// or null when createdAt could not be parsed as a date.
ageMs: number | null;
// True when at least one stale reason was recorded.
stale: boolean;
// Reasons the lock is considered stale: "missing-pid", "dead-pid",
// "invalid-createdAt", and/or "too-old".
staleReasons: string[];
// True when the lock file was deleted during the cleanup pass.
removed: boolean;
};
// Termination signals for which cleanup handlers are registered.
const CLEANUP_SIGNALS = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
// Union of the signal names listed in CLEANUP_SIGNALS.
type CleanupSignal = (typeof CLEANUP_SIGNALS)[number];
// Symbol.for() keys under which this module stores its per-process state on `process`
// (cleanup-handler state, the held-locks map, and the watchdog state respectively).
const CLEANUP_STATE_KEY = Symbol.for("openclaw.sessionWriteLockCleanupState");
const HELD_LOCKS_KEY = Symbol.for("openclaw.sessionWriteLockHeldLocks");
const WATCHDOG_STATE_KEY = Symbol.for("openclaw.sessionWriteLockWatchdogState");
// Fallback lock age (30 minutes) beyond which an on-disk lock is reported as "too-old".
const DEFAULT_STALE_MS = 30 * 60 * 1000;
// Fallback hold time (5 minutes) after which the watchdog force-releases a held lock.
const DEFAULT_MAX_HOLD_MS = 5 * 60 * 1000;
// Interval (60 seconds) at which the watchdog timer checks held locks.
const DEFAULT_WATCHDOG_INTERVAL_MS = 60_000;
/** Per-process state for the signal/exit cleanup handlers, stored on `process`. */
type CleanupState = {
// NOTE(review): presumably set once the handlers have been installed — the code
// that writes this flag is outside the visible hunks; confirm.
registered: boolean;
// Handler registered for each termination signal, keyed by signal name.
cleanupHandlers: Map<CleanupSignal, () => void>;
};
/** Per-process state for the lock watchdog timer, stored on `process`. */
type WatchdogState = {
// True once ensureWatchdogStarted has created the interval timer.
started: boolean;
// Interval in milliseconds the timer was started with (DEFAULT_WATCHDOG_INTERVAL_MS until started).
intervalMs: number;
// The setInterval handle; undefined until the watchdog has been started.
timer?: NodeJS.Timeout;
};
function resolveHeldLocks(): Map<string, HeldLock> {
const proc = process as NodeJS.Process & {
[HELD_LOCKS_KEY]?: Map<string, HeldLock>;
@@ -49,6 +74,82 @@ function resolveCleanupState(): CleanupState {
return proc[CLEANUP_STATE_KEY];
}
/**
 * Return the watchdog state object stored on `process`, creating it on first use.
 *
 * The freshly created state is not started and carries the default interval.
 */
function resolveWatchdogState(): WatchdogState {
  const host = process as NodeJS.Process & {
    [WATCHDOG_STATE_KEY]?: WatchdogState;
  };
  let state = host[WATCHDOG_STATE_KEY];
  if (!state) {
    state = {
      started: false,
      intervalMs: DEFAULT_WATCHDOG_INTERVAL_MS,
    };
    host[WATCHDOG_STATE_KEY] = state;
  }
  return state;
}
/**
 * Normalize a caller-supplied duration in milliseconds.
 *
 * @param value - Candidate duration; may be undefined or otherwise unusable.
 * @param fallback - Value returned whenever `value` is rejected.
 * @param opts - `allowInfinity`: when true, `Number.POSITIVE_INFINITY` is accepted
 *   and returned unchanged; otherwise it is replaced by `fallback`.
 * @returns `value` when it is a finite number greater than zero (or +Infinity when
 *   allowed); `fallback` for undefined, non-number, NaN, zero, or negative input.
 */
function resolvePositiveMs(
  value: number | undefined,
  fallback: number,
  opts: { allowInfinity?: boolean } = {},
): number {
  // Rejects undefined / non-numbers, NaN, zero and every negative value
  // (including -Infinity).
  if (typeof value !== "number" || Number.isNaN(value) || value <= 0) {
    return fallback;
  }
  // +Infinity is the only non-finite value that can reach this point.
  if (value === Number.POSITIVE_INFINITY) {
    return opts.allowInfinity ? value : fallback;
  }
  // Everything left is a finite positive number, so no further finiteness
  // check is needed.
  return value;
}
/**
 * Release one acquisition of a held session lock, or all of them when forced.
 *
 * @param normalizedSessionFile - Key of the lock in HELD_LOCKS.
 * @param held - The exact lock record the caller acquired; a call is ignored when
 *   HELD_LOCKS no longer maps the key to this record.
 * @param opts - `force`: drop the hold count to zero instead of decrementing it.
 * @returns true when the lock was fully released by this call (or by a release
 *   already in flight that this call waited for); false when the record is not the
 *   registered one or other acquisitions are still outstanding.
 */
async function releaseHeldLock(
  normalizedSessionFile: string,
  held: HeldLock,
  opts: { force?: boolean } = {},
): Promise<boolean> {
  // Only the record currently registered for this session file may be released.
  if (HELD_LOCKS.get(normalizedSessionFile) !== held) {
    return false;
  }

  if (opts.force) {
    held.count = 0;
  } else {
    held.count -= 1;
    if (held.count > 0) {
      // Other acquisitions are still outstanding.
      return false;
    }
  }

  const inFlight = held.releasePromise;
  if (inFlight) {
    // A release is already running; wait for it instead of starting another.
    await inFlight.catch(() => undefined);
    return true;
  }

  // Unregister before the first await so later callers no longer see this record.
  HELD_LOCKS.delete(normalizedSessionFile);

  const closeAndRemove = async (): Promise<void> => {
    try {
      await held.handle.close();
    } catch {
      // Ignore errors during cleanup - best effort.
    }
    try {
      await fs.rm(held.lockPath, { force: true });
    } catch {
      // Ignore errors during cleanup - best effort.
    }
  };

  const pending = closeAndRemove();
  held.releasePromise = pending;
  try {
    await pending;
    return true;
  } finally {
    held.releasePromise = undefined;
  }
}
/**
* Synchronously release all held locks.
* Used during process exit when async operations aren't reliable.
@@ -71,6 +172,42 @@ function releaseAllLocksSync(): void {
}
}
/**
 * Force-release every held lock whose hold time exceeds its own `maxHoldMs`.
 *
 * Locks are checked one after another; a warning is logged before each forced release.
 *
 * @param nowMs - Timestamp to measure hold times against; defaults to `Date.now()`.
 * @returns The number of locks for which the forced release reported success.
 */
async function runLockWatchdogCheck(nowMs = Date.now()): Promise<number> {
  let releasedCount = 0;
  for (const [sessionFile, lock] of HELD_LOCKS.entries()) {
    const elapsedMs = nowMs - lock.acquiredAt;
    // Still within its allowed hold window - leave it alone.
    if (elapsedMs <= lock.maxHoldMs) {
      continue;
    }

    // eslint-disable-next-line no-console
    console.warn(
      `[session-write-lock] releasing lock held for ${elapsedMs}ms (max=${lock.maxHoldMs}ms): ${lock.lockPath}`,
    );
    if (await releaseHeldLock(sessionFile, lock, { force: true })) {
      releasedCount += 1;
    }
  }
  return releasedCount;
}
/**
 * Start the lock watchdog interval timer if it has not been started yet.
 *
 * Later calls are no-ops, so the interval passed by the first caller is the one used.
 * The timer is unref'd (when supported) so it does not keep the process alive.
 *
 * @param intervalMs - Delay between watchdog checks, in milliseconds.
 */
function ensureWatchdogStarted(intervalMs: number): void {
  const state = resolveWatchdogState();
  if (!state.started) {
    state.started = true;
    state.intervalMs = intervalMs;

    const tick = (): void => {
      void runLockWatchdogCheck().catch(() => {
        // Ignore watchdog errors - best effort cleanup only.
      });
    };
    const timer = setInterval(tick, intervalMs);
    state.timer = timer;
    timer.unref?.();
  }
}
function handleTerminationSignal(signal: CleanupSignal): void {
releaseAllLocksSync();
const cleanupState = resolveCleanupState();
@@ -99,6 +236,8 @@ function registerCleanupHandlers(): void {
});
}
ensureWatchdogStarted(DEFAULT_WATCHDOG_INTERVAL_MS);
// Handle termination signals
for (const signal of CLEANUP_SIGNALS) {
if (cleanupState.cleanupHandlers.has(signal)) {
@@ -117,29 +256,125 @@ function registerCleanupHandlers(): void {
async function readLockPayload(lockPath: string): Promise<LockFilePayload | null> {
try {
const raw = await fs.readFile(lockPath, "utf8");
const parsed = JSON.parse(raw) as Partial<LockFilePayload>;
if (typeof parsed.pid !== "number") {
return null;
const parsed = JSON.parse(raw) as Record<string, unknown>;
const payload: LockFilePayload = {};
if (typeof parsed.pid === "number") {
payload.pid = parsed.pid;
}
if (typeof parsed.createdAt !== "string") {
return null;
if (typeof parsed.createdAt === "string") {
payload.createdAt = parsed.createdAt;
}
return { pid: parsed.pid, createdAt: parsed.createdAt };
return payload;
} catch {
return null;
}
}
/**
 * Derive liveness, age and staleness information from a parsed lock payload.
 *
 * @param payload - Parsed lock file contents, or null when the file could not be read/parsed.
 * @param staleMs - Age in milliseconds above which the lock counts as "too-old".
 * @param nowMs - Timestamp used to compute the lock's age.
 * @returns The payload-derived fields of a SessionLockInspection. `stale` is true when
 *   any reason applies: "missing-pid" or "dead-pid", and "invalid-createdAt" or "too-old".
 */
function inspectLockPayload(
  payload: LockFilePayload | null,
  staleMs: number,
  nowMs: number,
): Pick<
  SessionLockInspection,
  "pid" | "pidAlive" | "createdAt" | "ageMs" | "stale" | "staleReasons"
> {
  // Owner PID and whether that process is still running.
  let pid: number | null = null;
  let pidAlive = false;
  if (typeof payload?.pid === "number") {
    pid = payload.pid;
    pidAlive = isPidAlive(pid);
  }

  // Creation timestamp and the age derived from it.
  let createdAt: string | null = null;
  if (typeof payload?.createdAt === "string") {
    createdAt = payload.createdAt;
  }
  const parsedMs = createdAt ? Date.parse(createdAt) : Number.NaN;
  // Timestamps in the future are clamped to an age of zero.
  const ageMs = Number.isFinite(parsedMs) ? Math.max(0, nowMs - parsedMs) : null;

  const staleReasons: string[] = [];
  if (pid === null) {
    staleReasons.push("missing-pid");
  } else if (!pidAlive) {
    staleReasons.push("dead-pid");
  }
  if (ageMs === null) {
    staleReasons.push("invalid-createdAt");
  } else if (ageMs > staleMs) {
    staleReasons.push("too-old");
  }

  const stale = staleReasons.length > 0;
  return { pid, pidAlive, createdAt, ageMs, stale, staleReasons };
}
/**
 * Inspect every `*.jsonl.lock` file directly inside a sessions directory and,
 * unless disabled, delete the ones that are stale.
 *
 * @param params.sessionsDir - Directory to scan (resolved to an absolute path).
 * @param params.staleMs - Age threshold for "too-old"; invalid values fall back to DEFAULT_STALE_MS.
 * @param params.removeStale - Set to false to only report; any other value removes stale locks.
 * @param params.nowMs - Timestamp used for age computation; defaults to `Date.now()`.
 * @param params.log - Optional logger; `warn` is called once per removed lock.
 * @returns `locks`: every inspected lock in name order; `cleaned`: the subset that was removed.
 *   Both are empty when the directory does not exist.
 * @throws Any readdir error other than ENOENT, and any error from removing a lock file.
 */
export async function cleanStaleLockFiles(params: {
  sessionsDir: string;
  staleMs?: number;
  removeStale?: boolean;
  nowMs?: number;
  log?: {
    warn?: (message: string) => void;
    info?: (message: string) => void;
  };
}): Promise<{ locks: SessionLockInspection[]; cleaned: SessionLockInspection[] }> {
  const sessionsDir = path.resolve(params.sessionsDir);
  const staleMs = resolvePositiveMs(params.staleMs, DEFAULT_STALE_MS);
  const removeStale = params.removeStale !== false;
  const nowMs = params.nowMs ?? Date.now();

  let entries: fsSync.Dirent[] = [];
  try {
    entries = await fs.readdir(sessionsDir, { withFileTypes: true });
  } catch (err) {
    // A missing sessions directory simply means there is nothing to inspect.
    if ((err as { code?: string }).code !== "ENOENT") {
      throw err;
    }
    return { locks: [], cleaned: [] };
  }

  // Process lock files in a stable, name-sorted order.
  const lockNames = entries
    .map((entry) => entry.name)
    .filter((name) => name.endsWith(".jsonl.lock"))
    .toSorted((a, b) => a.localeCompare(b));

  const locks: SessionLockInspection[] = [];
  const cleaned: SessionLockInspection[] = [];
  for (const name of lockNames) {
    const lockPath = path.join(sessionsDir, name);
    const payload = await readLockPayload(lockPath);
    const inspection: SessionLockInspection = {
      lockPath,
      ...inspectLockPayload(payload, staleMs, nowMs),
      removed: false,
    };

    if (removeStale && inspection.stale) {
      await fs.rm(lockPath, { force: true });
      inspection.removed = true;
      cleaned.push(inspection);
      params.log?.warn?.(
        `removed stale session lock: ${lockPath} (${inspection.staleReasons.join(", ") || "unknown"})`,
      );
    }
    locks.push(inspection);
  }

  return { locks, cleaned };
}
export async function acquireSessionWriteLock(params: {
sessionFile: string;
timeoutMs?: number;
staleMs?: number;
maxHoldMs?: number;
}): Promise<{
release: () => Promise<void>;
}> {
registerCleanupHandlers();
const timeoutMs = params.timeoutMs ?? 10_000;
const staleMs = params.staleMs ?? 30 * 60 * 1000;
const timeoutMs = resolvePositiveMs(params.timeoutMs, 10_000, { allowInfinity: true });
const staleMs = resolvePositiveMs(params.staleMs, DEFAULT_STALE_MS);
const maxHoldMs = resolvePositiveMs(params.maxHoldMs, DEFAULT_MAX_HOLD_MS);
const sessionFile = path.resolve(params.sessionFile);
const sessionDir = path.dirname(sessionFile);
await fs.mkdir(sessionDir, { recursive: true });
@@ -151,25 +386,14 @@ export async function acquireSessionWriteLock(params: {
}
const normalizedSessionFile = path.join(normalizedDir, path.basename(sessionFile));
const lockPath = `${normalizedSessionFile}.lock`;
const release = async () => {
const current = HELD_LOCKS.get(normalizedSessionFile);
if (!current) {
return;
}
current.count -= 1;
if (current.count > 0) {
return;
}
HELD_LOCKS.delete(normalizedSessionFile);
await current.handle.close();
await fs.rm(current.lockPath, { force: true });
};
const held = HELD_LOCKS.get(normalizedSessionFile);
if (held) {
held.count += 1;
return {
release,
release: async () => {
await releaseHeldLock(normalizedSessionFile, held);
},
};
}
@@ -179,13 +403,20 @@ export async function acquireSessionWriteLock(params: {
attempt += 1;
try {
const handle = await fs.open(lockPath, "wx");
await handle.writeFile(
JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }, null, 2),
"utf8",
);
HELD_LOCKS.set(normalizedSessionFile, { count: 1, handle, lockPath });
const createdAt = new Date().toISOString();
await handle.writeFile(JSON.stringify({ pid: process.pid, createdAt }, null, 2), "utf8");
const createdHeld: HeldLock = {
count: 1,
handle,
lockPath,
acquiredAt: Date.now(),
maxHoldMs,
};
HELD_LOCKS.set(normalizedSessionFile, createdHeld);
return {
release,
release: async () => {
await releaseHeldLock(normalizedSessionFile, createdHeld);
},
};
} catch (err) {
const code = (err as { code?: unknown }).code;
@@ -193,10 +424,8 @@ export async function acquireSessionWriteLock(params: {
throw err;
}
const payload = await readLockPayload(lockPath);
const createdAt = payload?.createdAt ? Date.parse(payload.createdAt) : NaN;
const stale = !Number.isFinite(createdAt) || Date.now() - createdAt > staleMs;
const alive = payload?.pid ? isPidAlive(payload.pid) : false;
if (stale || !alive) {
const inspected = inspectLockPayload(payload, staleMs, Date.now());
if (inspected.stale) {
await fs.rm(lockPath, { force: true });
continue;
}
@@ -207,7 +436,7 @@ export async function acquireSessionWriteLock(params: {
}
const payload = await readLockPayload(lockPath);
const owner = payload?.pid ? `pid=${payload.pid}` : "unknown";
const owner = typeof payload?.pid === "number" ? `pid=${payload.pid}` : "unknown";
throw new Error(`session file locked (timeout ${timeoutMs}ms): ${owner} ${lockPath}`);
}
@@ -215,4 +444,5 @@ export const __testing = {
cleanupSignals: [...CLEANUP_SIGNALS],
handleTerminationSignal,
releaseAllLocksSync,
runLockWatchdogCheck,
};