fix: release stale session locks and add watchdog for hung API calls (#18060)

When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway acquires a session .jsonl.lock but the promise
never resolves, so the try/finally block never reaches release(). Since
the owning PID is the gateway itself, stale detection cannot help —
isPidAlive() always returns true.

This commit adds four layers of defense:

1. **In-process lock watchdog** (session-write-lock.ts)
   - Track acquiredAt timestamp on each held lock
   - 60-second interval timer checks all held locks
   - Auto-releases any lock held longer than maxHoldMs (default 5 min)
   - Catches the hung-API-call case that try/finally cannot

2. **Gateway startup cleanup** (server-startup.ts)
   - On boot, scan all agent session directories for *.jsonl.lock files
   - Remove locks with dead PIDs or older than staleMs (30 min)
   - Log each cleaned lock for diagnostics

3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
   - New health check scans for .jsonl.lock files
   - Reports PID status and age of each lock found
   - In --fix mode, removes stale locks automatically

4. **Transcript error entry on API failure** (attempt.ts)
   - When promptError is set, write an error marker to the session
     transcript before releasing the lock
   - Preserves conversation history even on model API failures

Closes #18060
This commit is contained in:
Vishal Doshi
2026-02-16 13:57:35 +00:00
committed by Peter Steinberger
parent 7d8d8c338b
commit e91a5b0216
8 changed files with 650 additions and 46 deletions

View File

@@ -0,0 +1,106 @@
import type { Dirent } from "node:fs";
import fs from "node:fs/promises";
import path from "node:path";
import { cleanStaleLockFiles, type SessionLockInspection } from "../agents/session-write-lock.js";
import { resolveStateDir } from "../config/paths.js";
import { note } from "../terminal/note.js";
import { shortenHomePath } from "../utils.js";
// Default staleness threshold (30 minutes, in milliseconds) used by
// noteSessionLockHealth when the caller does not supply `staleMs`.
const DEFAULT_STALE_MS = 30 * 60 * 1000;
/**
 * Build the list of per-agent `sessions` directory paths under
 * `<stateDir>/agents`.
 *
 * Only directory entries of the agents folder are considered; each one is
 * mapped to `<stateDir>/agents/<name>/sessions` without checking whether that
 * `sessions` path actually exists.
 *
 * @param stateDir - Root state directory that contains the `agents` folder.
 * @returns The candidate session directories, ordered with `localeCompare`.
 *   An empty array when the `agents` folder does not exist.
 * @throws Rethrows any `readdir` failure other than `ENOENT`.
 */
async function resolveAgentSessionDirs(stateDir: string): Promise<string[]> {
  const agentsRoot = path.join(stateDir, "agents");

  let children: Dirent[];
  try {
    children = await fs.readdir(agentsRoot, { withFileTypes: true });
  } catch (err) {
    // A missing agents folder just means there is nothing to inspect.
    if ((err as { code?: string }).code !== "ENOENT") {
      throw err;
    }
    return [];
  }

  const sessionDirs: string[] = [];
  for (const child of children) {
    if (child.isDirectory()) {
      sessionDirs.push(path.join(agentsRoot, child.name, "sessions"));
    }
  }
  return sessionDirs.toSorted((left, right) => left.localeCompare(right));
}
/**
 * Render a lock age as a short human-readable duration.
 *
 * Output shapes: `"42s"` below one minute, `"3m7s"` below one hour, and
 * `"2h15m"` from one hour upwards (hours are not rolled over into days).
 *
 * @param ageMs - Age in milliseconds, or `null` when the age is not known.
 * @returns The formatted duration, or `"unknown"` when `ageMs` is `null` or
 *   not a finite number.
 */
function formatAge(ageMs: number | null): string {
  // NaN / ±Infinity would otherwise render as "NaNhNaNm" or "InfinityhNaNm".
  if (ageMs === null || !Number.isFinite(ageMs)) {
    return "unknown";
  }
  // A negative age (clock skew, future-dated timestamp) is reported as zero
  // instead of a confusing "-90s".
  const totalSeconds = Math.max(0, Math.floor(ageMs / 1000));
  if (totalSeconds < 60) {
    return `${totalSeconds}s`;
  }
  const totalMinutes = Math.floor(totalSeconds / 60);
  if (totalMinutes < 60) {
    return `${totalMinutes}m${totalSeconds % 60}s`;
  }
  const hours = Math.floor(totalMinutes / 60);
  return `${hours}h${totalMinutes % 60}m`;
}
/**
 * Produce the single bullet line that describes one inspected lock file.
 *
 * Layout: `- <path> pid=… age=… stale=…`, followed by ` [removed]` when the
 * inspection marks the lock as removed. The path is passed through
 * `shortenHomePath` and the age through `formatAge`.
 *
 * @param lock - Inspection record for one lock file.
 * @returns The formatted report line.
 */
function formatLockLine(lock: SessionLockInspection): string {
  let pidPart: string;
  if (lock.pid === null) {
    pidPart = "pid=missing";
  } else {
    const liveness = lock.pidAlive ? "alive" : "dead";
    pidPart = `pid=${lock.pid} (${liveness})`;
  }

  const agePart = `age=${formatAge(lock.ageMs)}`;

  let stalePart = "stale=no";
  if (lock.stale) {
    // An empty reason list is reported as "unknown".
    const reasons = lock.staleReasons.join(", ") || "unknown";
    stalePart = `stale=yes (${reasons})`;
  }

  const suffix = lock.removed ? " [removed]" : "";
  const fields = ["-", shortenHomePath(lock.lockPath), pidPart, agePart, stalePart];
  return fields.join(" ") + suffix;
}
/**
 * Inspect session lock files across all agent session directories and emit a
 * "Session locks" note summarizing them.
 *
 * Each sessions directory is handed to `cleanStaleLockFiles` with
 * `removeStale` set to `shouldRepair`; the returned inspections are listed one
 * per line, sorted by lock path. Nothing is emitted when there are no session
 * directories or no lock files. If the session directories cannot be
 * enumerated, a failure note is emitted instead.
 *
 * @param params.shouldRepair - Only an explicit `true` enables removal;
 *   otherwise stale locks are reported with a hint to run the fix command.
 * @param params.staleMs - Staleness threshold forwarded to
 *   `cleanStaleLockFiles`; defaults to `DEFAULT_STALE_MS`.
 */
export async function noteSessionLockHealth(params?: { shouldRepair?: boolean; staleMs?: number }) {
  const repair = params?.shouldRepair === true;
  const staleThresholdMs = params?.staleMs ?? DEFAULT_STALE_MS;

  let dirs: string[];
  try {
    dirs = await resolveAgentSessionDirs(resolveStateDir(process.env));
  } catch (err) {
    note(`- Failed to inspect session lock files: ${String(err)}`, "Session locks");
    return;
  }

  // Directories are processed one at a time; an empty `dirs` simply leaves
  // `inspected` empty and falls through to the early return below.
  const inspected: SessionLockInspection[] = [];
  for (const sessionsDir of dirs) {
    const { locks } = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: staleThresholdMs,
      removeStale: repair,
    });
    inspected.push(...locks);
  }
  if (inspected.length === 0) {
    return;
  }

  let staleTotal = 0;
  let removedTotal = 0;
  for (const lock of inspected) {
    if (lock.stale) {
      staleTotal += 1;
    }
    if (lock.removed) {
      removedTotal += 1;
    }
  }

  const report: string[] = [
    `- Found ${inspected.length} session lock file${inspected.length === 1 ? "" : "s"}.`,
  ];
  const ordered = inspected.toSorted((left, right) => left.lockPath.localeCompare(right.lockPath));
  for (const lock of ordered) {
    report.push(formatLockLine(lock));
  }

  if (repair) {
    if (removedTotal > 0) {
      report.push(
        `- Removed ${removedTotal} stale session lock file${removedTotal === 1 ? "" : "s"}.`,
      );
    }
  } else if (staleTotal > 0) {
    report.push(`- ${staleTotal} lock file${staleTotal === 1 ? " is" : "s are"} stale.`);
    report.push('- Run "openclaw doctor --fix" to remove stale lock files automatically.');
  }

  note(report.join("\n"), "Session locks");
}