mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 13:11:22 +00:00
fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)
* exec: clean up PTY resources on timeout and exit * cli: harden resume cleanup and watchdog stalled runs * cli: productionize PTY and resume reliability paths * docs: add PTY process supervision architecture plan * docs: rewrite PTY supervision plan as pre-rewrite baseline * docs: switch PTY supervision plan to one-go execution * docs: add one-line root cause to PTY supervision plan * docs: add OS contracts and test matrix to PTY supervision plan * docs: define process-supervisor package placement and scope * docs: tie supervisor plan to existing CI lanes * docs: place PTY supervisor plan under src/process * refactor(process): route exec and cli runs through supervisor * docs(process): refresh PTY supervision plan * wip * fix(process): harden supervisor timeout and PTY termination * fix(process): harden supervisor adapters env and wait handling * ci: avoid failing formal conformance on comment permissions * test(ui): fix cron request mock argument typing * fix(ui): remove leftover conflict marker * fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
@@ -3,50 +3,69 @@ import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import type { CliBackendConfig } from "../config/types.js";
|
||||
import { runCliAgent } from "./cli-runner.js";
|
||||
import { cleanupResumeProcesses, cleanupSuspendedCliProcesses } from "./cli-runner/helpers.js";
|
||||
import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";
|
||||
|
||||
const runCommandWithTimeoutMock = vi.fn();
|
||||
const runExecMock = vi.fn();
|
||||
const supervisorSpawnMock = vi.fn();
|
||||
|
||||
vi.mock("../process/exec.js", () => ({
|
||||
runCommandWithTimeout: (...args: unknown[]) => runCommandWithTimeoutMock(...args),
|
||||
runExec: (...args: unknown[]) => runExecMock(...args),
|
||||
vi.mock("../process/supervisor/index.js", () => ({
|
||||
getProcessSupervisor: () => ({
|
||||
spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
|
||||
cancel: vi.fn(),
|
||||
cancelScope: vi.fn(),
|
||||
reconcileOrphans: vi.fn(),
|
||||
getRecord: vi.fn(),
|
||||
}),
|
||||
}));
|
||||
|
||||
describe("runCliAgent resume cleanup", () => {
|
||||
type MockRunExit = {
|
||||
reason:
|
||||
| "manual-cancel"
|
||||
| "overall-timeout"
|
||||
| "no-output-timeout"
|
||||
| "spawn-error"
|
||||
| "signal"
|
||||
| "exit";
|
||||
exitCode: number | null;
|
||||
exitSignal: NodeJS.Signals | number | null;
|
||||
durationMs: number;
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
timedOut: boolean;
|
||||
noOutputTimedOut: boolean;
|
||||
};
|
||||
|
||||
function createManagedRun(exit: MockRunExit, pid = 1234) {
|
||||
return {
|
||||
runId: "run-supervisor",
|
||||
pid,
|
||||
startedAtMs: Date.now(),
|
||||
stdin: undefined,
|
||||
wait: vi.fn().mockResolvedValue(exit),
|
||||
cancel: vi.fn(),
|
||||
};
|
||||
}
|
||||
|
||||
describe("runCliAgent with process supervisor", () => {
|
||||
beforeEach(() => {
|
||||
runCommandWithTimeoutMock.mockReset();
|
||||
runExecMock.mockReset();
|
||||
supervisorSpawnMock.mockReset();
|
||||
});
|
||||
|
||||
it("kills stale resume processes for codex sessions", async () => {
|
||||
const selfPid = process.pid;
|
||||
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: " 1 999 S /bin/launchd\n",
|
||||
it("runs CLI through supervisor and returns payload", async () => {
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "exit",
|
||||
exitCode: 0,
|
||||
exitSignal: null,
|
||||
durationMs: 50,
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
}) // cleanupSuspendedCliProcesses (ps) — ppid 999 != selfPid, no match
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` ${selfPid + 1} ${selfPid} codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
|
||||
` ${selfPid + 2} 999 codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
}) // cleanupResumeProcesses (ps)
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" }) // cleanupResumeProcesses (kill -TERM)
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" }); // cleanupResumeProcesses (kill -9)
|
||||
runCommandWithTimeoutMock.mockResolvedValueOnce({
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
code: 0,
|
||||
signal: null,
|
||||
killed: false,
|
||||
});
|
||||
timedOut: false,
|
||||
noOutputTimedOut: false,
|
||||
}),
|
||||
);
|
||||
|
||||
await runCliAgent({
|
||||
const result = await runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: "/tmp",
|
||||
@@ -58,28 +77,80 @@ describe("runCliAgent resume cleanup", () => {
|
||||
cliSessionId: "thread-123",
|
||||
});
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
expect(result.payloads?.[0]?.text).toBe("ok");
|
||||
expect(supervisorSpawnMock).toHaveBeenCalledTimes(1);
|
||||
const input = supervisorSpawnMock.mock.calls[0]?.[0] as {
|
||||
argv?: string[];
|
||||
mode?: string;
|
||||
timeoutMs?: number;
|
||||
noOutputTimeoutMs?: number;
|
||||
replaceExistingScope?: boolean;
|
||||
scopeKey?: string;
|
||||
};
|
||||
expect(input.mode).toBe("child");
|
||||
expect(input.argv?.[0]).toBe("codex");
|
||||
expect(input.timeoutMs).toBe(1_000);
|
||||
expect(input.noOutputTimeoutMs).toBeGreaterThanOrEqual(1_000);
|
||||
expect(input.replaceExistingScope).toBe(true);
|
||||
expect(input.scopeKey).toContain("thread-123");
|
||||
});
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(4);
|
||||
it("fails with timeout when no-output watchdog trips", async () => {
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "no-output-timeout",
|
||||
exitCode: null,
|
||||
exitSignal: "SIGKILL",
|
||||
durationMs: 200,
|
||||
stdout: "",
|
||||
stderr: "",
|
||||
timedOut: true,
|
||||
noOutputTimedOut: true,
|
||||
}),
|
||||
);
|
||||
|
||||
// Second call: cleanupResumeProcesses ps
|
||||
const psCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(psCall[0]).toBe("ps");
|
||||
await expect(
|
||||
runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: "/tmp",
|
||||
prompt: "hi",
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-2",
|
||||
cliSessionId: "thread-123",
|
||||
}),
|
||||
).rejects.toThrow("produced no output");
|
||||
});
|
||||
|
||||
// Third call: TERM, only the child PID
|
||||
const termCall = runExecMock.mock.calls[2] ?? [];
|
||||
expect(termCall[0]).toBe("kill");
|
||||
const termArgs = termCall[1] as string[];
|
||||
expect(termArgs).toEqual(["-TERM", String(selfPid + 1)]);
|
||||
it("fails with timeout when overall timeout trips", async () => {
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "overall-timeout",
|
||||
exitCode: null,
|
||||
exitSignal: "SIGKILL",
|
||||
durationMs: 200,
|
||||
stdout: "",
|
||||
stderr: "",
|
||||
timedOut: true,
|
||||
noOutputTimedOut: false,
|
||||
}),
|
||||
);
|
||||
|
||||
// Fourth call: KILL, only the child PID
|
||||
const killCall = runExecMock.mock.calls[3] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
const killArgs = killCall[1] as string[];
|
||||
expect(killArgs).toEqual(["-9", String(selfPid + 1)]);
|
||||
await expect(
|
||||
runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: "/tmp",
|
||||
prompt: "hi",
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-3",
|
||||
cliSessionId: "thread-123",
|
||||
}),
|
||||
).rejects.toThrow("exceeded timeout");
|
||||
});
|
||||
|
||||
it("falls back to per-agent workspace when workspaceDir is missing", async () => {
|
||||
@@ -94,14 +165,18 @@ describe("runCliAgent resume cleanup", () => {
|
||||
},
|
||||
} satisfies OpenClawConfig;
|
||||
|
||||
runExecMock.mockResolvedValue({ stdout: "", stderr: "" });
|
||||
runCommandWithTimeoutMock.mockResolvedValueOnce({
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
code: 0,
|
||||
signal: null,
|
||||
killed: false,
|
||||
});
|
||||
supervisorSpawnMock.mockResolvedValueOnce(
|
||||
createManagedRun({
|
||||
reason: "exit",
|
||||
exitCode: 0,
|
||||
exitSignal: null,
|
||||
durationMs: 25,
|
||||
stdout: "ok",
|
||||
stderr: "",
|
||||
timedOut: false,
|
||||
noOutputTimedOut: false,
|
||||
}),
|
||||
);
|
||||
|
||||
try {
|
||||
await runCliAgent({
|
||||
@@ -114,264 +189,33 @@ describe("runCliAgent resume cleanup", () => {
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-1",
|
||||
runId: "run-4",
|
||||
});
|
||||
} finally {
|
||||
await fs.rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
|
||||
const options = runCommandWithTimeoutMock.mock.calls[0]?.[1] as { cwd?: string };
|
||||
expect(options.cwd).toBe(path.resolve(fallbackWorkspace));
|
||||
const input = supervisorSpawnMock.mock.calls[0]?.[0] as { cwd?: string };
|
||||
expect(input.cwd).toBe(path.resolve(fallbackWorkspace));
|
||||
});
|
||||
});
|
||||
|
||||
it("throws when sessionKey is malformed", async () => {
|
||||
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-"));
|
||||
const mainWorkspace = path.join(tempDir, "workspace-main");
|
||||
const researchWorkspace = path.join(tempDir, "workspace-research");
|
||||
await fs.mkdir(mainWorkspace, { recursive: true });
|
||||
await fs.mkdir(researchWorkspace, { recursive: true });
|
||||
const cfg = {
|
||||
agents: {
|
||||
defaults: {
|
||||
workspace: mainWorkspace,
|
||||
describe("resolveCliNoOutputTimeoutMs", () => {
|
||||
it("uses backend-configured resume watchdog override", () => {
|
||||
const timeoutMs = resolveCliNoOutputTimeoutMs({
|
||||
backend: {
|
||||
command: "codex",
|
||||
reliability: {
|
||||
watchdog: {
|
||||
resume: {
|
||||
noOutputTimeoutMs: 42_000,
|
||||
},
|
||||
},
|
||||
},
|
||||
list: [{ id: "research", workspace: researchWorkspace }],
|
||||
},
|
||||
} satisfies OpenClawConfig;
|
||||
|
||||
try {
|
||||
await expect(
|
||||
runCliAgent({
|
||||
sessionId: "s1",
|
||||
sessionKey: "agent::broken",
|
||||
agentId: "research",
|
||||
sessionFile: "/tmp/session.jsonl",
|
||||
workspaceDir: undefined as unknown as string,
|
||||
config: cfg,
|
||||
prompt: "hi",
|
||||
provider: "codex-cli",
|
||||
model: "gpt-5.2-codex",
|
||||
timeoutMs: 1_000,
|
||||
runId: "run-2",
|
||||
}),
|
||||
).rejects.toThrow("Malformed agent session key");
|
||||
} finally {
|
||||
await fs.rm(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
expect(runCommandWithTimeoutMock).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe("cleanupSuspendedCliProcesses", () => {
|
||||
beforeEach(() => {
|
||||
runExecMock.mockReset();
|
||||
});
|
||||
|
||||
it("skips when no session tokens are configured", async () => {
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "tool",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("matches sessionArg-based commands", async () => {
|
||||
const selfPid = process.pid;
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` 40 ${selfPid} T+ claude --session-id thread-1 -p`,
|
||||
` 41 ${selfPid} S claude --session-id thread-2 -p`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "claude",
|
||||
sessionArg: "--session-id",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(2);
|
||||
const killCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
expect(killCall[1]).toEqual(["-9", "40"]);
|
||||
});
|
||||
|
||||
it("matches resumeArgs with positional session id", async () => {
|
||||
const selfPid = process.pid;
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` 50 ${selfPid} T codex exec resume thread-99 --color never --sandbox read-only`,
|
||||
` 51 ${selfPid} T codex exec resume other --color never --sandbox read-only`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "codex",
|
||||
resumeArgs: ["exec", "resume", "{sessionId}", "--color", "never", "--sandbox", "read-only"],
|
||||
} as CliBackendConfig,
|
||||
1,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(2);
|
||||
const killCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
expect(killCall[1]).toEqual(["-9", "50", "51"]);
|
||||
});
|
||||
|
||||
it("only kills child processes of current process (ppid validation)", async () => {
|
||||
const selfPid = process.pid;
|
||||
const childPid = selfPid + 1;
|
||||
const unrelatedPid = 9999;
|
||||
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` ${childPid} ${selfPid} T claude --session-id thread-1 -p`,
|
||||
` ${unrelatedPid} 100 T claude --session-id thread-2 -p`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "claude",
|
||||
sessionArg: "--session-id",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(2);
|
||||
const killCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
// Only childPid killed; unrelatedPid (ppid=100) excluded
|
||||
expect(killCall[1]).toEqual(["-9", String(childPid)]);
|
||||
});
|
||||
|
||||
it("skips all processes when none are children of current process", async () => {
|
||||
runExecMock.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
" 200 100 T claude --session-id thread-1 -p",
|
||||
" 201 100 T claude --session-id thread-2 -p",
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
timeoutMs: 120_000,
|
||||
useResume: true,
|
||||
});
|
||||
|
||||
await cleanupSuspendedCliProcesses(
|
||||
{
|
||||
command: "claude",
|
||||
sessionArg: "--session-id",
|
||||
} as CliBackendConfig,
|
||||
0,
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
// Only ps called — no kill because no matching ppid
|
||||
expect(runExecMock).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("cleanupResumeProcesses", () => {
|
||||
beforeEach(() => {
|
||||
runExecMock.mockReset();
|
||||
});
|
||||
|
||||
it("only kills resume processes owned by current process", async () => {
|
||||
const selfPid = process.pid;
|
||||
|
||||
runExecMock
|
||||
.mockResolvedValueOnce({
|
||||
stdout: [
|
||||
` ${selfPid + 1} ${selfPid} codex exec resume abc-123`,
|
||||
` ${selfPid + 2} 999 codex exec resume abc-123`,
|
||||
].join("\n"),
|
||||
stderr: "",
|
||||
})
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" })
|
||||
.mockResolvedValueOnce({ stdout: "", stderr: "" });
|
||||
|
||||
await cleanupResumeProcesses(
|
||||
{
|
||||
command: "codex",
|
||||
resumeArgs: ["exec", "resume", "{sessionId}"],
|
||||
} as CliBackendConfig,
|
||||
"abc-123",
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
expect(runExecMock).toHaveBeenCalledTimes(3);
|
||||
|
||||
const termCall = runExecMock.mock.calls[1] ?? [];
|
||||
expect(termCall[0]).toBe("kill");
|
||||
expect(termCall[1]).toEqual(["-TERM", String(selfPid + 1)]);
|
||||
|
||||
const killCall = runExecMock.mock.calls[2] ?? [];
|
||||
expect(killCall[0]).toBe("kill");
|
||||
expect(killCall[1]).toEqual(["-9", String(selfPid + 1)]);
|
||||
});
|
||||
|
||||
it("skips kill when no resume processes match ppid", async () => {
|
||||
runExecMock.mockResolvedValueOnce({
|
||||
stdout: [" 300 100 codex exec resume abc-123", " 301 200 codex exec resume abc-123"].join(
|
||||
"\n",
|
||||
),
|
||||
stderr: "",
|
||||
});
|
||||
|
||||
await cleanupResumeProcesses(
|
||||
{
|
||||
command: "codex",
|
||||
resumeArgs: ["exec", "resume", "{sessionId}"],
|
||||
} as CliBackendConfig,
|
||||
"abc-123",
|
||||
);
|
||||
|
||||
if (process.platform === "win32") {
|
||||
expect(runExecMock).not.toHaveBeenCalled();
|
||||
return;
|
||||
}
|
||||
|
||||
// Only ps called — no kill because no matching ppid
|
||||
expect(runExecMock).toHaveBeenCalledTimes(1);
|
||||
expect(timeoutMs).toBe(42_000);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user