mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 07:21:23 +00:00
fix: release stale session locks and add watchdog for hung API calls (#18060)
When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway has already acquired a session .jsonl.lock, but the
call's promise never settles, so the try/finally block never reaches
release(). Since the owning PID is the gateway itself, stale detection
cannot help — isPidAlive() always returns true.
This commit adds four layers of defense:
1. **In-process lock watchdog** (session-write-lock.ts)
- Track acquiredAt timestamp on each held lock
- 60-second interval timer checks all held locks
- Auto-releases any lock held longer than maxHoldMs (default 5 min)
- Catches the hung-API-call case that try/finally cannot
2. **Gateway startup cleanup** (server-startup.ts)
- On boot, scan all agent session directories for *.jsonl.lock files
- Remove locks with dead PIDs or older than staleMs (30 min)
- Log each cleaned lock for diagnostics
3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
- New health check scans for .jsonl.lock files
- Reports PID status and age of each lock found
- In --fix mode, removes stale locks automatically
4. **Transcript error entry on API failure** (attempt.ts)
- When promptError is set, write an error marker to the session
transcript before releasing the lock
- Preserves conversation history even on model API failures
Closes #18060
This commit is contained in:
committed by
Peter Steinberger
parent
7d8d8c338b
commit
e91a5b0216
@@ -1,3 +1,6 @@
|
||||
import type { Dirent } from "node:fs";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import type { CliDeps } from "../cli/deps.js";
|
||||
import type { loadConfig } from "../config/config.js";
|
||||
import type { loadOpenClawPlugins } from "../plugins/loader.js";
|
||||
@@ -8,6 +11,8 @@ import {
|
||||
resolveConfiguredModelRef,
|
||||
resolveHooksGmailModel,
|
||||
} from "../agents/model-selection.js";
|
||||
import { cleanStaleLockFiles } from "../agents/session-write-lock.js";
|
||||
import { resolveStateDir } from "../config/paths.js";
|
||||
import { startGmailWatcher } from "../hooks/gmail-watcher.js";
|
||||
import {
|
||||
clearInternalHooks,
|
||||
@@ -24,6 +29,27 @@ import {
|
||||
} from "./server-restart-sentinel.js";
|
||||
import { startGatewayMemoryBackend } from "./server-startup-memory.js";
|
||||
|
||||
// Passed as `staleMs` to the startup session-lock cleanup: 30 minutes, in milliseconds.
const SESSION_LOCK_STALE_MS = 30 * 60 * 1000;
|
||||
|
||||
/**
 * Lists the `sessions` directory path for every agent directory found under
 * `<stateDir>/agents`.
 *
 * Paths are derived from directory names only; this function does not check
 * that each `sessions` directory actually exists on disk.
 *
 * @param stateDir - Root state directory that contains the `agents` folder.
 * @returns One `<stateDir>/agents/<name>/sessions` path per agent directory,
 *   sorted with `localeCompare`. Empty when the `agents` folder is missing.
 * @throws Re-throws any `readdir` failure other than `ENOENT`.
 */
async function resolveAgentSessionDirs(stateDir: string): Promise<string[]> {
  const agentsDir = path.join(stateDir, "agents");
  let entries: Dirent[] = [];
  try {
    entries = await fs.readdir(agentsDir, { withFileTypes: true });
  } catch (err) {
    const code = (err as { code?: string }).code;
    // A missing agents folder simply means there is nothing to scan.
    if (code === "ENOENT") {
      return [];
    }
    throw err;
  }

  // Only directory entries are treated as agents; plain files are ignored.
  return entries
    .filter((entry) => entry.isDirectory())
    .map((entry) => path.join(agentsDir, entry.name, "sessions"))
    .toSorted((a, b) => a.localeCompare(b));
}
|
||||
|
||||
export async function startGatewaySidecars(params: {
|
||||
cfg: ReturnType<typeof loadConfig>;
|
||||
pluginRegistry: ReturnType<typeof loadOpenClawPlugins>;
|
||||
@@ -39,6 +65,21 @@ export async function startGatewaySidecars(params: {
|
||||
logChannels: { info: (msg: string) => void; error: (msg: string) => void };
|
||||
logBrowser: { error: (msg: string) => void };
|
||||
}) {
|
||||
try {
|
||||
const stateDir = resolveStateDir(process.env);
|
||||
const sessionDirs = await resolveAgentSessionDirs(stateDir);
|
||||
for (const sessionsDir of sessionDirs) {
|
||||
await cleanStaleLockFiles({
|
||||
sessionsDir,
|
||||
staleMs: SESSION_LOCK_STALE_MS,
|
||||
removeStale: true,
|
||||
log: { warn: (message) => params.log.warn(message) },
|
||||
});
|
||||
}
|
||||
} catch (err) {
|
||||
params.log.warn(`session lock cleanup failed on startup: ${String(err)}`);
|
||||
}
|
||||
|
||||
// Start OpenClaw browser control server (unless disabled via config).
|
||||
let browserControl: Awaited<ReturnType<typeof startBrowserControlServerIfEnabled>> = null;
|
||||
try {
|
||||
|
||||
Reference in New Issue
Block a user