mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 05:41:24 +00:00
fix: release stale session locks and add watchdog for hung API calls (#18060)
When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway acquires a session .jsonl.lock but the promise
never resolves, so the try/finally block never reaches release(). Since
the owning PID is the gateway itself, stale detection cannot help —
isPidAlive() always returns true.
This commit adds four layers of defense:
1. **In-process lock watchdog** (session-write-lock.ts)
- Track acquiredAt timestamp on each held lock
- 60-second interval timer checks all held locks
- Auto-releases any lock held longer than maxHoldMs (default 5 min)
- Catches the hung-API-call case that try/finally cannot
2. **Gateway startup cleanup** (server-startup.ts)
- On boot, scan all agent session directories for *.jsonl.lock files
- Remove locks with dead PIDs or older than staleMs (30 min)
- Log each cleaned lock for diagnostics
3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
- New health check scans for .jsonl.lock files
- Reports PID status and age of each lock found
- In --fix mode, removes stale locks automatically
4. **Transcript error entry on API failure** (attempt.ts)
- When promptError is set, write an error marker to the session
transcript before releasing the lock
- Preserves conversation history even on model API failures
Closes #18060
This commit is contained in:
committed by
Peter Steinberger
parent
7d8d8c338b
commit
e91a5b0216
@@ -1,8 +1,8 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { __testing, acquireSessionWriteLock } from "./session-write-lock.js";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { __testing, acquireSessionWriteLock, cleanStaleLockFiles } from "./session-write-lock.js";
|
||||
|
||||
describe("acquireSessionWriteLock", () => {
|
||||
it("reuses locks across symlinked session paths", async () => {
|
||||
@@ -72,6 +72,95 @@ describe("acquireSessionWriteLock", () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Verifies that the in-process watchdog force-releases a lock held past its
// maxHoldMs, that the session can then be locked again, and that the stale
// handle's release() is a no-op with respect to the newly acquired lock.
it("watchdog releases stale in-process locks", async () => {
  const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
  // Silence console.warn for the duration of the test; restored in `finally`.
  const consoleWarn = vi.spyOn(console, "warn").mockImplementation(() => {});

  try {
    const sessionFile = path.join(tmpRoot, "session.jsonl");
    const lockFile = `${sessionFile}.lock`;

    // Small helpers so each step below reads as "lock file is there / is gone".
    const expectLockFilePresent = () => expect(fs.access(lockFile)).resolves.toBeUndefined();
    const expectLockFileAbsent = () => expect(fs.access(lockFile)).rejects.toThrow();

    // A 1 ms hold limit makes this lock overdue by the time the check runs.
    const staleHandle = await acquireSessionWriteLock({
      sessionFile,
      timeoutMs: 500,
      maxHoldMs: 1,
    });

    // Run the watchdog check with a "now" one second in the future.
    const simulatedNow = Date.now() + 1000;
    const releasedCount = await __testing.runLockWatchdogCheck(simulatedNow);
    expect(releasedCount).toBeGreaterThanOrEqual(1);
    await expectLockFileAbsent();

    const freshHandle = await acquireSessionWriteLock({ sessionFile, timeoutMs: 500 });
    await expectLockFilePresent();

    // Old release handle must not affect the new lock.
    await staleHandle.release();
    await expectLockFilePresent();

    await freshHandle.release();
    await expectLockFileAbsent();
  } finally {
    consoleWarn.mockRestore();
    await fs.rm(tmpRoot, { recursive: true, force: true });
  }
});
|
||||
|
||||
// Verifies cleanStaleLockFiles against three lock files in one sessions
// directory: one with pid 999_999 and an old timestamp, one with this
// process's pid and an old timestamp, and one with this process's pid and a
// recent timestamp. With staleMs = 30s only the recent one should survive.
it("cleans stale .jsonl lock files in sessions directories", async () => {
  const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-lock-"));
  const sessionsDir = path.join(tmpRoot, "sessions");
  await fs.mkdir(sessionsDir, { recursive: true });

  // Single reference time shared by the fixtures and the cleaner call.
  const nowMs = Date.now();
  const lockPathFor = (fileName) => path.join(sessionsDir, fileName);

  // Each fixture: lock file name, owning pid, and age relative to nowMs.
  const fixtures = [
    { fileName: "dead.jsonl.lock", pid: 999_999, ageMs: 120_000 },
    { fileName: "old-live.jsonl.lock", pid: process.pid, ageMs: 120_000 },
    { fileName: "fresh-live.jsonl.lock", pid: process.pid, ageMs: 1_000 },
  ];

  try {
    // Written one at a time, in declaration order.
    for (const { fileName, pid, ageMs } of fixtures) {
      const payload = JSON.stringify({
        pid,
        createdAt: new Date(nowMs - ageMs).toISOString(),
      });
      await fs.writeFile(lockPathFor(fileName), payload, "utf8");
    }

    const outcome = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: 30_000,
      nowMs,
      removeStale: true,
    });

    // All three locks are reported; two of them are cleaned.
    expect(outcome.locks).toHaveLength(3);
    expect(outcome.cleaned).toHaveLength(2);

    const cleanedNames = outcome.cleaned.map(({ lockPath }) => path.basename(lockPath)).toSorted();
    expect(cleanedNames).toEqual(["dead.jsonl.lock", "old-live.jsonl.lock"]);

    // On disk: both old locks are gone, the recent one is still present.
    await expect(fs.access(lockPathFor("dead.jsonl.lock"))).rejects.toThrow();
    await expect(fs.access(lockPathFor("old-live.jsonl.lock"))).rejects.toThrow();
    await expect(fs.access(lockPathFor("fresh-live.jsonl.lock"))).resolves.toBeUndefined();
  } finally {
    await fs.rm(tmpRoot, { recursive: true, force: true });
  }
});
|
||||
|
||||
it("removes held locks on termination signals", async () => {
|
||||
const signals = ["SIGINT", "SIGTERM", "SIGQUIT", "SIGABRT"] as const;
|
||||
for (const signal of signals) {
|
||||
|
||||
Reference in New Issue
Block a user