mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-07 15:51:22 +00:00
fix: release stale session locks and add watchdog for hung API calls (#18060)
When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway acquires a session .jsonl.lock but the promise
never resolves, so the try/finally block never reaches release(). Since
the owning PID is the gateway itself, stale detection cannot help —
isPidAlive() always returns true.
This commit adds four layers of defense:
1. **In-process lock watchdog** (session-write-lock.ts)
- Track acquiredAt timestamp on each held lock
- 60-second interval timer checks all held locks
- Auto-releases any lock held longer than maxHoldMs (default 5 min)
- Catches the hung-API-call case that try/finally cannot
2. **Gateway startup cleanup** (server-startup.ts)
- On boot, scan all agent session directories for *.jsonl.lock files
- Remove locks with dead PIDs or older than staleMs (30 min)
- Log each cleaned lock for diagnostics
3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
- New health check scans for .jsonl.lock files
- Reports PID status and age of each lock found
- In --fix mode, removes stale locks automatically
4. **Transcript error entry on API failure** (attempt.ts)
- When promptError is set, write an error marker to the session
transcript before releasing the lock
- Preserves conversation history even on model API failures
Closes #18060
This commit is contained in:
committed by
Peter Steinberger
parent
7d8d8c338b
commit
e91a5b0216
83
src/commands/doctor-session-locks.test.ts
Normal file
83
src/commands/doctor-session-locks.test.ts
Normal file
@@ -0,0 +1,83 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
// Build the spy through vi.hoisted so it already exists when the hoisted
// vi.mock factory below is evaluated.
const note = vi.hoisted(() => vi.fn());

// Replace the terminal note helper with the spy so each test can inspect the
// message and title that would have been printed.
vi.mock("../terminal/note.js", () => {
  return { note };
});
|
||||
|
||||
import { noteSessionLockHealth } from "./doctor-session-locks.js";
|
||||
|
||||
describe("noteSessionLockHealth", () => {
  let root: string;
  let prevStateDir: string | undefined;

  /** Creates `<root>/agents/main/sessions` and returns its path. */
  const makeSessionsDir = async (): Promise<string> => {
    const dir = path.join(root, "agents", "main", "sessions");
    await fs.mkdir(dir, { recursive: true });
    return dir;
  };

  /** Writes a lock payload recording `pid` and a `createdAt` that lies `ageMs` in the past. */
  const writeLock = async (lockPath: string, pid: number, ageMs: number): Promise<void> => {
    const createdAt = new Date(Date.now() - ageMs).toISOString();
    await fs.writeFile(lockPath, JSON.stringify({ pid, createdAt }), "utf8");
  };

  beforeEach(async () => {
    note.mockReset();
    // Point the state dir at a throwaway temp directory for the duration of each test.
    prevStateDir = process.env.OPENCLAW_STATE_DIR;
    root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-doctor-locks-"));
    process.env.OPENCLAW_STATE_DIR = root;
  });

  afterEach(async () => {
    // Restore the environment exactly: an originally unset variable must end up unset again.
    if (prevStateDir !== undefined) {
      process.env.OPENCLAW_STATE_DIR = prevStateDir;
    } else {
      delete process.env.OPENCLAW_STATE_DIR;
    }
    await fs.rm(root, { recursive: true, force: true });
  });

  it("reports existing lock files with pid status and age", async () => {
    const sessionsDir = await makeSessionsDir();
    const lockPath = path.join(sessionsDir, "active.jsonl.lock");
    // Held by this (live) process and only 1.5s old, well under the 60s threshold.
    await writeLock(lockPath, process.pid, 1500);

    await noteSessionLockHealth({ shouldRepair: false, staleMs: 60_000 });

    expect(note).toHaveBeenCalledTimes(1);
    const [message, title] = note.mock.calls[0] as [string, string];
    expect(title).toBe("Session locks");
    expect(message).toContain("Found 1 session lock file");
    expect(message).toContain(`pid=${process.pid} (alive)`);
    expect(message).toContain("stale=no");
    // Report-only mode must leave the lock file in place.
    await expect(fs.access(lockPath)).resolves.toBeUndefined();
  });

  it("removes stale locks in repair mode", async () => {
    const sessionsDir = await makeSessionsDir();
    const staleLock = path.join(sessionsDir, "stale.jsonl.lock");
    const freshLock = path.join(sessionsDir, "fresh.jsonl.lock");

    // Stale candidate: pid -1 and 120s old against a 30s threshold.
    await writeLock(staleLock, -1, 120_000);
    // Fresh lock: owned by this process and created just now.
    await writeLock(freshLock, process.pid, 0);

    await noteSessionLockHealth({ shouldRepair: true, staleMs: 30_000 });

    expect(note).toHaveBeenCalledTimes(1);
    const [message] = note.mock.calls[0] as [string, string];
    expect(message).toContain("[removed]");
    expect(message).toContain("Removed 1 stale session lock file");

    await expect(fs.access(staleLock)).rejects.toThrow();
    await expect(fs.access(freshLock)).resolves.toBeUndefined();
  });
});
|
||||
106
src/commands/doctor-session-locks.ts
Normal file
106
src/commands/doctor-session-locks.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
import type { Dirent } from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { cleanStaleLockFiles, type SessionLockInspection } from "../agents/session-write-lock.js";
|
||||
import { resolveStateDir } from "../config/paths.js";
|
||||
import { note } from "../terminal/note.js";
|
||||
import { shortenHomePath } from "../utils.js";
|
||||
|
||||
// Default stale threshold in milliseconds (30 minutes); used as `staleMs` when the caller supplies none.
const DEFAULT_STALE_MS = 30 * 60 * 1000;
|
||||
|
||||
/**
 * Lists the `sessions` directory path of every agent directory under `<stateDir>/agents`.
 *
 * Only directory entries are considered. The `sessions` paths are built, not checked
 * for existence.
 *
 * @param stateDir - Root state directory that contains the `agents` folder.
 * @returns The candidate session directories, sorted with `localeCompare`; an empty
 *   list when the `agents` folder does not exist.
 * @throws Rethrows any `readdir` failure other than `ENOENT`.
 */
async function resolveAgentSessionDirs(stateDir: string): Promise<string[]> {
  const agentsRoot = path.join(stateDir, "agents");

  let dirents: Dirent[];
  try {
    dirents = await fs.readdir(agentsRoot, { withFileTypes: true });
  } catch (error) {
    // A missing agents folder simply means there is nothing to inspect.
    if ((error as { code?: string }).code !== "ENOENT") {
      throw error;
    }
    return [];
  }

  const sessionDirs: string[] = [];
  for (const dirent of dirents) {
    if (dirent.isDirectory()) {
      sessionDirs.push(path.join(agentsRoot, dirent.name, "sessions"));
    }
  }
  return sessionDirs.toSorted((left, right) => left.localeCompare(right));
}
|
||||
|
||||
/**
 * Formats a lock age as a compact duration for display.
 *
 * Output shapes: `"42s"` below one minute, `"3m7s"` below one hour, and
 * `"2h15m"` from one hour upwards (seconds are dropped at that scale).
 *
 * @param ageMs - Age in milliseconds, or `null` when the age could not be determined.
 * @returns The formatted duration, or `"unknown"` when `ageMs` is `null` or not a
 *   finite number.
 */
function formatAge(ageMs: number | null): string {
  // NaN/Infinity would otherwise print as "NaNs"/"Infinitys".
  if (ageMs === null || !Number.isFinite(ageMs)) {
    return "unknown";
  }
  // A creation timestamp in the future (clock skew, hand-edited lock file) gives a
  // negative age; clamp to zero rather than printing something like "-120s".
  const seconds = Math.floor(Math.max(0, ageMs) / 1000);
  if (seconds < 60) {
    return `${seconds}s`;
  }
  const minutes = Math.floor(seconds / 60);
  const remainingSeconds = seconds % 60;
  if (minutes < 60) {
    return `${minutes}m${remainingSeconds}s`;
  }
  const hours = Math.floor(minutes / 60);
  const remainingMinutes = minutes % 60;
  return `${hours}h${remainingMinutes}m`;
}
|
||||
|
||||
/**
 * Renders one inspected lock as a single bullet line for the doctor report.
 *
 * The line contains the shortened lock path, the owning pid with its liveness (or
 * `pid=missing`), the formatted age, the stale verdict with its reasons, and a
 * trailing ` [removed]` marker when the lock was removed.
 */
function formatLockLine(lock: SessionLockInspection): string {
  let pidPart = "pid=missing";
  if (lock.pid !== null) {
    const liveness = lock.pidAlive ? "alive" : "dead";
    pidPart = `pid=${lock.pid} (${liveness})`;
  }

  const agePart = `age=${formatAge(lock.ageMs)}`;

  let stalePart = "stale=no";
  if (lock.stale) {
    // An empty reason list still gets a placeholder so the parentheses are never blank.
    const reasons = lock.staleReasons.join(", ") || "unknown";
    stalePart = `stale=yes (${reasons})`;
  }

  const removedSuffix = lock.removed ? " [removed]" : "";
  const fields = [`- ${shortenHomePath(lock.lockPath)}`, pidPart, agePart, stalePart];
  return fields.join(" ") + removedSuffix;
}
|
||||
|
||||
/**
 * Doctor health check for session lock files.
 *
 * Resolves every agent's `sessions` directory under the state dir, runs
 * `cleanStaleLockFiles` on each one, and emits a single "Session locks" note that
 * lists every lock found. Nothing is emitted when no lock files are found.
 *
 * @param params.shouldRepair - When exactly `true`, passed as `removeStale` so stale
 *   locks are removed; otherwise the check is report-only.
 * @param params.staleMs - Stale threshold in milliseconds forwarded to
 *   `cleanStaleLockFiles`; defaults to `DEFAULT_STALE_MS`.
 */
export async function noteSessionLockHealth(params?: { shouldRepair?: boolean; staleMs?: number }) {
  const repair = params?.shouldRepair === true;
  const staleThresholdMs = params?.staleMs ?? DEFAULT_STALE_MS;

  let dirs: string[];
  try {
    dirs = await resolveAgentSessionDirs(resolveStateDir(process.env));
  } catch (err) {
    note(`- Failed to inspect session lock files: ${String(err)}`, "Session locks");
    return;
  }

  // Directories are processed one at a time, in order; with no directories the loop
  // is skipped and the empty-result guard below returns without a note.
  const inspected: SessionLockInspection[] = [];
  for (const sessionsDir of dirs) {
    const { locks } = await cleanStaleLockFiles({
      sessionsDir,
      staleMs: staleThresholdMs,
      removeStale: repair,
    });
    inspected.push(...locks);
  }

  if (inspected.length === 0) {
    return;
  }

  let staleTotal = 0;
  let removedTotal = 0;
  for (const lock of inspected) {
    if (lock.stale) {
      staleTotal += 1;
    }
    if (lock.removed) {
      removedTotal += 1;
    }
  }

  const report: string[] = [
    `- Found ${inspected.length} session lock file${inspected.length === 1 ? "" : "s"}.`,
  ];
  for (const lock of inspected.toSorted((a, b) => a.lockPath.localeCompare(b.lockPath))) {
    report.push(formatLockLine(lock));
  }

  if (!repair && staleTotal > 0) {
    // Report-only mode: summarize and point the user at the repair flag.
    report.push(`- ${staleTotal} lock file${staleTotal === 1 ? " is" : "s are"} stale.`);
    report.push('- Run "openclaw doctor --fix" to remove stale lock files automatically.');
  }
  if (repair && removedTotal > 0) {
    report.push(
      `- Removed ${removedTotal} stale session lock file${removedTotal === 1 ? "" : "s"}.`,
    );
  }

  note(report.join("\n"), "Session locks");
}
|
||||
@@ -44,6 +44,7 @@ import {
|
||||
import { createDoctorPrompter, type DoctorOptions } from "./doctor-prompter.js";
|
||||
import { maybeRepairSandboxImages, noteSandboxScopeWarnings } from "./doctor-sandbox.js";
|
||||
import { noteSecurityWarnings } from "./doctor-security.js";
|
||||
import { noteSessionLockHealth } from "./doctor-session-locks.js";
|
||||
import { noteStateIntegrity, noteWorkspaceBackupTip } from "./doctor-state-integrity.js";
|
||||
import {
|
||||
detectLegacyStateMigrations,
|
||||
@@ -188,6 +189,7 @@ export async function doctorCommand(
|
||||
}
|
||||
|
||||
await noteStateIntegrity(cfg, prompter, configResult.path ?? CONFIG_PATH);
|
||||
await noteSessionLockHealth({ shouldRepair: prompter.shouldRepair });
|
||||
|
||||
cfg = await maybeRepairSandboxImages(cfg, runtime, prompter);
|
||||
noteSandboxScopeWarnings(cfg);
|
||||
|
||||
Reference in New Issue
Block a user