mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-07 23:41:24 +00:00
fix: release stale session locks and add watchdog for hung API calls (#18060)
When a model API call hangs indefinitely (e.g. Anthropic quota exceeded
mid-call), the gateway is left holding the session .jsonl.lock it acquired
before the call: the promise never settles, so the try/finally block never
reaches release(). Since the owning PID is the gateway itself, PID-based
stale detection cannot help — isPidAlive() always returns true.
This commit adds four layers of defense:
1. **In-process lock watchdog** (session-write-lock.ts)
- Tracks an acquiredAt timestamp on each held lock
- Runs a 60-second interval timer that checks all held locks
- Auto-releases any lock held longer than maxHoldMs (default 5 min)
- Catches the hung-API-call case that try/finally cannot
2. **Gateway startup cleanup** (server-startup.ts)
- On boot, scan all agent session directories for *.jsonl.lock files
- Remove locks whose owning PID is dead or that are older than staleMs (30 min)
- Log each cleaned lock for diagnostics
3. **openclaw doctor stale lock detection** (doctor-session-locks.ts)
- New health check scans for .jsonl.lock files
- Reports PID status and age of each lock found
- In --fix mode, removes stale locks automatically
4. **Transcript error entry on API failure** (attempt.ts)
- When promptError is set, write an error marker to the session
transcript before releasing the lock
- Preserves conversation history even on model API failures
Closes #18060
This commit is contained in:
committed by
Peter Steinberger
parent
7d8d8c338b
commit
e91a5b0216
@@ -73,6 +73,9 @@ vi.mock("@mariozechner/pi-ai", async () => {
|
||||
return buildAssistantMessage(model);
|
||||
},
|
||||
streamSimple: (model: { api: string; provider: string; id: string }) => {
|
||||
if (model.id === "mock-throw") {
|
||||
throw new Error("transport failed");
|
||||
}
|
||||
const stream = new actual.AssistantMessageEventStream();
|
||||
queueMicrotask(() => {
|
||||
stream.push({
|
||||
@@ -182,20 +185,21 @@ const textFromContent = (content: unknown) => {
|
||||
return undefined;
|
||||
};
|
||||
|
||||
const readSessionMessages = async (sessionFile: string) => {
|
||||
const readSessionEntries = async (sessionFile: string) => {
|
||||
const raw = await fs.readFile(sessionFile, "utf-8");
|
||||
return raw
|
||||
.split(/\r?\n/)
|
||||
.filter(Boolean)
|
||||
.map(
|
||||
(line) =>
|
||||
JSON.parse(line) as {
|
||||
type?: string;
|
||||
message?: { role?: string; content?: unknown };
|
||||
},
|
||||
)
|
||||
.map((line) => JSON.parse(line) as { type?: string; customType?: string; data?: unknown });
|
||||
};
|
||||
|
||||
const readSessionMessages = async (sessionFile: string) => {
|
||||
const entries = await readSessionEntries(sessionFile);
|
||||
return entries
|
||||
.filter((entry) => entry.type === "message")
|
||||
.map((entry) => entry.message as { role?: string; content?: unknown });
|
||||
.map(
|
||||
(entry) => (entry as { message?: { role?: string; content?: unknown } }).message,
|
||||
) as Array<{ role?: string; content?: unknown }>;
|
||||
};
|
||||
|
||||
const runDefaultEmbeddedTurn = async (sessionFile: string, prompt: string) => {
|
||||
@@ -373,6 +377,35 @@ describe("runEmbeddedPiAgent", () => {
|
||||
expect(userIndex).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
it("persists prompt transport errors as transcript entries", async () => {
|
||||
const sessionFile = nextSessionFile();
|
||||
const cfg = makeOpenAiConfig(["mock-throw"]);
|
||||
await ensureModels(cfg);
|
||||
|
||||
const result = await runEmbeddedPiAgent({
|
||||
sessionId: "session:test",
|
||||
sessionKey: testSessionKey,
|
||||
sessionFile,
|
||||
workspaceDir,
|
||||
config: cfg,
|
||||
prompt: "transport error",
|
||||
provider: "openai",
|
||||
model: "mock-throw",
|
||||
timeoutMs: 5_000,
|
||||
agentDir,
|
||||
enqueue: immediateEnqueue,
|
||||
});
|
||||
expect(result.payloads[0]?.isError).toBe(true);
|
||||
|
||||
const entries = await readSessionEntries(sessionFile);
|
||||
const promptErrorEntry = entries.find(
|
||||
(entry) => entry.type === "custom" && entry.customType === "openclaw:prompt-error",
|
||||
) as { data?: { error?: string } } | undefined;
|
||||
|
||||
expect(promptErrorEntry).toBeTruthy();
|
||||
expect(promptErrorEntry?.data?.error).toContain("transport failed");
|
||||
});
|
||||
|
||||
it(
|
||||
"appends new user + assistant after existing transcript entries",
|
||||
{ timeout: 90_000 },
|
||||
|
||||
Reference in New Issue
Block a user