fix: release stale session locks and add watchdog for hung API calls (#18060)

When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway is already holding the session's .jsonl.lock. The
pending promise never settles, so the finally block of the surrounding
try/finally never runs and release() is never called. Since the owning PID
is the gateway itself, PID-based stale detection cannot help —
isPidAlive() always returns true.

This commit adds four layers of defense:

1. **In-process lock watchdog** (session-write-lock.ts)
   - Track acquiredAt timestamp on each held lock
   - 60-second interval timer checks all held locks
   - Auto-releases any lock held longer than maxHoldMs (default 5 min)
   - Catches the hung-API-call case that try/finally cannot

2. **Gateway startup cleanup** (server-startup.ts)
   - On boot, scan all agent session directories for *.jsonl.lock files
   - Remove locks with dead PIDs or older than staleMs (30 min)
   - Log each cleaned lock for diagnostics

3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
   - New health check scans for .jsonl.lock files
   - Reports PID status and age of each lock found
   - In --fix mode, removes stale locks automatically

4. **Transcript error entry on API failure** (attempt.ts)
   - When promptError is set, write an error marker to the session
     transcript before releasing the lock
   - Preserves conversation history even on model API failures

Closes #18060
This commit is contained in:
Vishal Doshi
2026-02-16 13:57:35 +00:00
committed by Peter Steinberger
parent 7d8d8c338b
commit e91a5b0216
8 changed files with 650 additions and 46 deletions

View File

@@ -1,8 +1,8 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { __testing, acquireSessionWriteLock } from "./session-write-lock.js";
import { describe, expect, it, vi } from "vitest";
import { __testing, acquireSessionWriteLock, cleanStaleLockFiles } from "./session-write-lock.js";
describe("acquireSessionWriteLock", () => {
it("reuses locks across symlinked session paths", async () => {
@@ -72,6 +72,95 @@ describe("acquireSessionWriteLock", () => {
}
});
// Verifies that a watchdog pass force-releases a lock held past its maxHoldMs,
// and that the release handle of the evicted lock cannot remove a lock that
// was acquired afterwards for the same session file.
it("watchdog releases stale in-process locks", async () => {
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
// Silence console.warn for the duration of the test; restored in `finally`.
// NOTE(review): presumably the watchdog warns when it evicts a lock — confirm.
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
try {
const sessionFile = path.join(root, "session.jsonl");
const lockPath = `${sessionFile}.lock`;
// maxHoldMs of 1 ms makes this lock overdue almost immediately.
const lockA = await acquireSessionWriteLock({
sessionFile,
timeoutMs: 500,
maxHoldMs: 1,
});
// Run one watchdog check with a timestamp 1 s in the future so lockA is
// past its hold limit without the test having to sleep.
const released = await __testing.runLockWatchdogCheck(Date.now() + 1000);
expect(released).toBeGreaterThanOrEqual(1);
// The lock file must be gone from disk after the watchdog pass.
await expect(fs.access(lockPath)).rejects.toThrow();
// A fresh acquisition of the same session file succeeds and recreates the lock file.
const lockB = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
await expect(fs.access(lockPath)).resolves.toBeUndefined();
// Old release handle must not affect the new lock.
await lockA.release();
await expect(fs.access(lockPath)).resolves.toBeUndefined();
// Releasing the current holder removes the lock file.
await lockB.release();
await expect(fs.access(lockPath)).rejects.toThrow();
} finally {
warnSpy.mockRestore();
await fs.rm(root, { recursive: true, force: true });
}
});
// Seeds a sessions directory with three lock files (old lock with PID 999_999,
// old lock owned by this process, recent lock owned by this process) and checks
// that cleanStaleLockFiles reports all three but removes only the two old ones.
it("cleans stale .jsonl lock files in sessions directories", async () => {
  const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
  const sessionsDir = path.join(root, "sessions");
  await fs.mkdir(sessionsDir, { recursive: true });
  const nowMs = Date.now();

  const inSessionsDir = (fileName: string): string => path.join(sessionsDir, fileName);
  const staleDeadLock = inSessionsDir("dead.jsonl.lock");
  const staleAliveLock = inSessionsDir("old-live.jsonl.lock");
  const freshAliveLock = inSessionsDir("fresh-live.jsonl.lock");

  // Writes a lock payload whose createdAt lies `ageMs` before `nowMs`.
  const seedLock = (lockPath: string, pid: number, ageMs: number): Promise<void> => {
    const payload = JSON.stringify({
      pid,
      createdAt: new Date(nowMs - ageMs).toISOString(),
    });
    return fs.writeFile(lockPath, payload, "utf8");
  };

  try {
    // NOTE(review): 999_999 is presumably a PID with no running process — confirm
    // this holds on hosts with a large pid_max.
    await seedLock(staleDeadLock, 999_999, 120_000);
    await seedLock(staleAliveLock, process.pid, 120_000);
    await seedLock(freshAliveLock, process.pid, 1_000);

    // With staleMs at 30 s, the two 120 s-old locks are over the threshold and
    // the 1 s-old lock is not.
    const result = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: 30_000,
      nowMs,
      removeStale: true,
    });

    expect(result.locks).toHaveLength(3);
    expect(result.cleaned).toHaveLength(2);
    const cleanedNames = result.cleaned.map((entry) => path.basename(entry.lockPath)).toSorted();
    expect(cleanedNames).toEqual(["dead.jsonl.lock", "old-live.jsonl.lock"]);

    // Only the fresh lock owned by this process may remain on disk.
    await expect(fs.access(staleDeadLock)).rejects.toThrow();
    await expect(fs.access(staleAliveLock)).rejects.toThrow();
    await expect(fs.access(freshAliveLock)).resolves.toBeUndefined();
  } finally {
    await fs.rm(root, { recursive: true, force: true });
  }
});
it("removes held locks on termination signals", async () => {
const signals = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
for (const signal of signals) {