fix: release stale session locks and add watchdog for hung API calls (#18060)

When a model API call hangs indefinitely (e.g. Anthropic quota exceeded mid-call), the gateway acquires a session .jsonl.lock but the promise never resolves, so the try/finally block never reaches release(). Since the lock's owning PID is the gateway process itself, stale detection cannot help — isPidAlive() always returns true.

This commit adds four layers of defense:

1. **In-process lock watchdog** (session-write-lock.ts)
   - Track acquiredAt timestamp on each held lock
   - 60-second interval timer checks all held locks
   - Auto-releases any lock held longer than maxHoldMs (default 5 min)
   - Catches the hung-API-call case that try/finally cannot

2. **Gateway startup cleanup** (server-startup.ts)
   - On boot, scan all agent session directories for *.jsonl.lock files
   - Remove locks with dead PIDs or older than staleMs (30 min)
   - Log each cleaned lock for diagnostics

3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
   - New health check scans for .jsonl.lock files
   - Reports PID status and age of each lock found
   - In --fix mode, removes stale locks automatically

4. **Transcript error entry on API failure** (attempt.ts)
   - When promptError is set, write an error marker to the session transcript before releasing the lock
   - Preserves conversation history even on model API failures

Closes #18060
2026-05-07 23:41:24 +00:00 · 2026-02-16 13:57:35 +00:00
parent 7d8d8c338b
commit e91a5b0216
8 changed files with 650 additions and 46 deletions
--- a/src/agents/pi-embedded-runner.e2e.test.ts
+++ b/src/agents/pi-embedded-runner.e2e.test.ts
@@ -73,6 +73,9 @@ vi.mock("@mariozechner/pi-ai", async () => {
      return buildAssistantMessage(model);
    },
    streamSimple: (model: { api: string; provider: string; id: string }) => {
+      if (model.id === "mock-throw") {
+        throw new Error("transport failed");
+      }
      const stream = new actual.AssistantMessageEventStream();
      queueMicrotask(() => {
        stream.push({
@@ -182,20 +185,21 @@ const textFromContent = (content: unknown) => {
  return undefined;
 };

-const readSessionMessages = async (sessionFile: string) => {
+const readSessionEntries = async (sessionFile: string) => {
  const raw = await fs.readFile(sessionFile, "utf-8");
  return raw
    .split(/\r?\n/)
    .filter(Boolean)
-    .map(
-      (line) =>
-        JSON.parse(line) as {
-          type?: string;
-          message?: { role?: string; content?: unknown };
-        },
-    )
+    .map((line) => JSON.parse(line) as { type?: string; customType?: string; data?: unknown });
+};
+
+const readSessionMessages = async (sessionFile: string) => {
+  const entries = await readSessionEntries(sessionFile);
+  return entries
    .filter((entry) => entry.type === "message")
-    .map((entry) => entry.message as { role?: string; content?: unknown });
+    .map(
+      (entry) => (entry as { message?: { role?: string; content?: unknown } }).message,
+    ) as Array<{ role?: string; content?: unknown }>;
 };

 const runDefaultEmbeddedTurn = async (sessionFile: string, prompt: string) => {
@@ -373,6 +377,35 @@ describe("runEmbeddedPiAgent", () => {
    expect(userIndex).toBeGreaterThanOrEqual(0);
  });

+  it("persists prompt transport errors as transcript entries", async () => {
+    const sessionFile = nextSessionFile();
+    const cfg = makeOpenAiConfig(["mock-throw"]);
+    await ensureModels(cfg);
+
+    const result = await runEmbeddedPiAgent({
+      sessionId: "session:test",
+      sessionKey: testSessionKey,
+      sessionFile,
+      workspaceDir,
+      config: cfg,
+      prompt: "transport error",
+      provider: "openai",
+      model: "mock-throw",
+      timeoutMs: 5_000,
+      agentDir,
+      enqueue: immediateEnqueue,
+    });
+    expect(result.payloads[0]?.isError).toBe(true);
+
+    const entries = await readSessionEntries(sessionFile);
+    const promptErrorEntry = entries.find(
+      (entry) => entry.type === "custom" && entry.customType === "openclaw:prompt-error",
+    ) as { data?: { error?: string } } | undefined;
+
+    expect(promptErrorEntry).toBeTruthy();
+    expect(promptErrorEntry?.data?.error).toContain("transport failed");
+  });
+
  it(
    "appends new user + assistant after existing transcript entries",
    { timeout: 90_000 },