mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 18:08:27 +00:00
fix: release stale session locks and add watchdog for hung API calls (#18060)
When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway has already acquired the session .jsonl.lock, but
the call's promise never settles, so the try/finally block never reaches
release(). Because the lock's owning PID is the gateway process itself,
PID-based stale detection cannot help — isPidAlive() always returns true.
This commit adds four layers of defense:
1. **In-process lock watchdog** (session-write-lock.ts)
- Track acquiredAt timestamp on each held lock
- 60-second interval timer checks all held locks
- Auto-releases any lock held longer than maxHoldMs (default 5 min)
- Catches the hung-API-call case that try/finally cannot
2. **Gateway startup cleanup** (server-startup.ts)
- On boot, scan all agent session directories for *.jsonl.lock files
- Remove locks whose owning PID is dead or that are older than staleMs (30 min)
- Log each cleaned lock for diagnostics
3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
- New health check scans for .jsonl.lock files
- Reports PID status and age of each lock found
- In --fix mode, removes stale locks automatically
4. **Transcript error entry on API failure** (attempt.ts)
- When promptError was set by the prompt call itself (promptErrorSource
is "prompt", not "compaction"), append an "openclaw:prompt-error" entry
to the session transcript before releasing the lock
- Preserves conversation history even on model API failures
Closes #18060
This commit is contained in:
committed by
Peter Steinberger
parent
7d8d8c338b
commit
e91a5b0216
@@ -848,6 +848,7 @@ export async function runEmbeddedAttempt(
|
||||
}).sessionAgentId;
|
||||
|
||||
let promptError: unknown = null;
|
||||
let promptErrorSource: "prompt" | "compaction" | null = null;
|
||||
try {
|
||||
const promptStartedAt = Date.now();
|
||||
|
||||
@@ -1000,6 +1001,7 @@ export async function runEmbeddedAttempt(
|
||||
}
|
||||
} catch (err) {
|
||||
promptError = err;
|
||||
promptErrorSource = "prompt";
|
||||
} finally {
|
||||
log.debug(
|
||||
`embedded run prompt end: runId=${params.runId} sessionId=${params.sessionId} durationMs=${Date.now() - promptStartedAt}`,
|
||||
@@ -1022,6 +1024,7 @@ export async function runEmbeddedAttempt(
|
||||
if (isRunnerAbortError(err)) {
|
||||
if (!promptError) {
|
||||
promptError = err;
|
||||
promptErrorSource = "compaction";
|
||||
}
|
||||
if (!isProbeSession) {
|
||||
log.debug(
|
||||
@@ -1070,6 +1073,23 @@ export async function runEmbeddedAttempt(
|
||||
}
|
||||
messagesSnapshot = snapshotSelection.messagesSnapshot;
|
||||
sessionIdUsed = snapshotSelection.sessionIdUsed;
|
||||
|
||||
if (promptError && promptErrorSource === "prompt") {
|
||||
try {
|
||||
sessionManager.appendCustomEntry("openclaw:prompt-error", {
|
||||
timestamp: Date.now(),
|
||||
runId: params.runId,
|
||||
sessionId: params.sessionId,
|
||||
provider: params.provider,
|
||||
model: params.modelId,
|
||||
api: params.model.api,
|
||||
error: describeUnknownError(promptError),
|
||||
});
|
||||
} catch (entryErr) {
|
||||
log.warn(`failed to persist prompt error entry: ${String(entryErr)}`);
|
||||
}
|
||||
}
|
||||
|
||||
cacheTrace?.recordStage("session:after", {
|
||||
messages: messagesSnapshot,
|
||||
note: timedOutDuringCompaction
|
||||
|
||||
Reference in New Issue
Block a user