fix: release stale session locks and add watchdog for hung API calls (#18060)

When a model API call hangs indefinitely (e.g. Anthropic quota exceeded mid-call), the gateway acquires a session .jsonl.lock but the promise never resolves, so the try/finally block never reaches release(). Since the owning PID is the gateway itself, stale detection cannot help — isPidAlive() always returns true.

This commit adds four layers of defense:

1. **In-process lock watchdog** (session-write-lock.ts)
   - Track acquiredAt timestamp on each held lock
   - 60-second interval timer checks all held locks
   - Auto-releases any lock held longer than maxHoldMs (default 5 min)
   - Catches the hung-API-call case that try/finally cannot

2. **Gateway startup cleanup** (server-startup.ts)
   - On boot, scan all agent session directories for *.jsonl.lock files
   - Remove locks with dead PIDs or older than staleMs (30 min)
   - Log each cleaned lock for diagnostics

3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
   - New health check scans for .jsonl.lock files
   - Reports PID status and age of each lock found
   - In --fix mode, removes stale locks automatically

4. **Transcript error entry on API failure** (attempt.ts)
   - When promptError is set by the prompt call itself (not by a compaction abort), write an `openclaw:prompt-error` marker to the session transcript before releasing the lock
   - Preserves conversation history even on model API failures

Closes #18060
2026-05-08 18:08:27 +00:00 · 2026-02-16 13:57:35 +00:00
parent 7d8d8c338b
commit e91a5b0216
8 changed files with 650 additions and 46 deletions
--- a/src/agents/pi-embedded-runner/run/attempt.ts
+++ b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -848,6 +848,7 @@ export async function runEmbeddedAttempt(
            }).sessionAgentId;

      let promptError: unknown = null;
+      let promptErrorSource: "prompt" | "compaction" | null = null;
      try {
        const promptStartedAt = Date.now();

@@ -1000,6 +1001,7 @@ export async function runEmbeddedAttempt(
          }
        } catch (err) {
          promptError = err;
+          promptErrorSource = "prompt";
        } finally {
          log.debug(
            `embedded run prompt end: runId=${params.runId} sessionId=${params.sessionId} durationMs=${Date.now() - promptStartedAt}`,
@@ -1022,6 +1024,7 @@ export async function runEmbeddedAttempt(
          if (isRunnerAbortError(err)) {
            if (!promptError) {
              promptError = err;
+              promptErrorSource = "compaction";
            }
            if (!isProbeSession) {
              log.debug(
@@ -1070,6 +1073,23 @@ export async function runEmbeddedAttempt(
        }
        messagesSnapshot = snapshotSelection.messagesSnapshot;
        sessionIdUsed = snapshotSelection.sessionIdUsed;
+
+        if (promptError && promptErrorSource === "prompt") {
+          try {
+            sessionManager.appendCustomEntry("openclaw:prompt-error", {
+              timestamp: Date.now(),
+              runId: params.runId,
+              sessionId: params.sessionId,
+              provider: params.provider,
+              model: params.modelId,
+              api: params.model.api,
+              error: describeUnknownError(promptError),
+            });
+          } catch (entryErr) {
+            log.warn(`failed to persist prompt error entry: ${String(entryErr)}`);
+          }
+        }
+
        cacheTrace?.recordStage("session:after", {
          messages: messagesSnapshot,
          note: timedOutDuringCompaction