fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)

* exec: clean up PTY resources on timeout and exit

* cli: harden resume cleanup and watchdog stalled runs

* cli: productionize PTY and resume reliability paths

* docs: add PTY process supervision architecture plan

* docs: rewrite PTY supervision plan as pre-rewrite baseline

* docs: switch PTY supervision plan to one-go execution

* docs: add one-line root cause to PTY supervision plan

* docs: add OS contracts and test matrix to PTY supervision plan

* docs: define process-supervisor package placement and scope

* docs: tie supervisor plan to existing CI lanes

* docs: place PTY supervisor plan under src/process

* refactor(process): route exec and cli runs through supervisor

* docs(process): refresh PTY supervision plan

* wip

* fix(process): harden supervisor timeout and PTY termination

* fix(process): harden supervisor adapters env and wait handling

* ci: avoid failing formal conformance on comment permissions

* test(ui): fix cron request mock argument typing

* fix(ui): remove leftover conflict marker

* fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
Onur
2026-02-16 09:32:05 +08:00
committed by GitHub
parent a73e7786e7
commit cd44a0d01e
32 changed files with 2759 additions and 855 deletions

View File

@@ -3,50 +3,69 @@ import os from "node:os";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../config/config.js";
import type { CliBackendConfig } from "../config/types.js";
import { runCliAgent } from "./cli-runner.js";
import { cleanupResumeProcesses, cleanupSuspendedCliProcesses } from "./cli-runner/helpers.js";
import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";
const runCommandWithTimeoutMock = vi.fn();
const runExecMock = vi.fn();
const supervisorSpawnMock = vi.fn();
vi.mock("../process/exec.js", () => ({
runCommandWithTimeout: (...args: unknown[]) => runCommandWithTimeoutMock(...args),
runExec: (...args: unknown[]) => runExecMock(...args),
vi.mock("../process/supervisor/index.js", () => ({
getProcessSupervisor: () => ({
spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
cancel: vi.fn(),
cancelScope: vi.fn(),
reconcileOrphans: vi.fn(),
getRecord: vi.fn(),
}),
}));
describe("runCliAgent resume cleanup", () => {
type MockRunExit = {
reason:
| "manual-cancel"
| "overall-timeout"
| "no-output-timeout"
| "spawn-error"
| "signal"
| "exit";
exitCode: number | null;
exitSignal: NodeJS.Signals | number | null;
durationMs: number;
stdout: string;
stderr: string;
timedOut: boolean;
noOutputTimedOut: boolean;
};
function createManagedRun(exit: MockRunExit, pid = 1234) {
return {
runId: "run-supervisor",
pid,
startedAtMs: Date.now(),
stdin: undefined,
wait: vi.fn().mockResolvedValue(exit),
cancel: vi.fn(),
};
}
describe("runCliAgent with process supervisor", () => {
beforeEach(() => {
runCommandWithTimeoutMock.mockReset();
runExecMock.mockReset();
supervisorSpawnMock.mockReset();
});
it("kills stale resume processes for codex sessions", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: " 1 999 S /bin/launchd\n",
it("runs CLI through supervisor and returns payload", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "exit",
exitCode: 0,
exitSignal: null,
durationMs: 50,
stdout: "ok",
stderr: "",
}) // cleanupSuspendedCliProcesses (ps) — ppid 999 != selfPid, no match
.mockResolvedValueOnce({
stdout: [
` ${selfPid + 1} ${selfPid} codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
` ${selfPid + 2} 999 codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
].join("\n"),
stderr: "",
}) // cleanupResumeProcesses (ps)
.mockResolvedValueOnce({ stdout: "", stderr: "" }) // cleanupResumeProcesses (kill -TERM)
.mockResolvedValueOnce({ stdout: "", stderr: "" }); // cleanupResumeProcesses (kill -9)
runCommandWithTimeoutMock.mockResolvedValueOnce({
stdout: "ok",
stderr: "",
code: 0,
signal: null,
killed: false,
});
timedOut: false,
noOutputTimedOut: false,
}),
);
await runCliAgent({
const result = await runCliAgent({
sessionId: "s1",
sessionFile: "/tmp/session.jsonl",
workspaceDir: "/tmp",
@@ -58,28 +77,80 @@ describe("runCliAgent resume cleanup", () => {
cliSessionId: "thread-123",
});
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(result.payloads?.[0]?.text).toBe("ok");
expect(supervisorSpawnMock).toHaveBeenCalledTimes(1);
const input = supervisorSpawnMock.mock.calls[0]?.[0] as {
argv?: string[];
mode?: string;
timeoutMs?: number;
noOutputTimeoutMs?: number;
replaceExistingScope?: boolean;
scopeKey?: string;
};
expect(input.mode).toBe("child");
expect(input.argv?.[0]).toBe("codex");
expect(input.timeoutMs).toBe(1_000);
expect(input.noOutputTimeoutMs).toBeGreaterThanOrEqual(1_000);
expect(input.replaceExistingScope).toBe(true);
expect(input.scopeKey).toContain("thread-123");
});
expect(runExecMock).toHaveBeenCalledTimes(4);
it("fails with timeout when no-output watchdog trips", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "no-output-timeout",
exitCode: null,
exitSignal: "SIGKILL",
durationMs: 200,
stdout: "",
stderr: "",
timedOut: true,
noOutputTimedOut: true,
}),
);
// Second call: cleanupResumeProcesses ps
const psCall = runExecMock.mock.calls[1] ?? [];
expect(psCall[0]).toBe("ps");
await expect(
runCliAgent({
sessionId: "s1",
sessionFile: "/tmp/session.jsonl",
workspaceDir: "/tmp",
prompt: "hi",
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-2",
cliSessionId: "thread-123",
}),
).rejects.toThrow("produced no output");
});
// Third call: TERM, only the child PID
const termCall = runExecMock.mock.calls[2] ?? [];
expect(termCall[0]).toBe("kill");
const termArgs = termCall[1] as string[];
expect(termArgs).toEqual(["-TERM", String(selfPid + 1)]);
it("fails with timeout when overall timeout trips", async () => {
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "overall-timeout",
exitCode: null,
exitSignal: "SIGKILL",
durationMs: 200,
stdout: "",
stderr: "",
timedOut: true,
noOutputTimedOut: false,
}),
);
// Fourth call: KILL, only the child PID
const killCall = runExecMock.mock.calls[3] ?? [];
expect(killCall[0]).toBe("kill");
const killArgs = killCall[1] as string[];
expect(killArgs).toEqual(["-9", String(selfPid + 1)]);
await expect(
runCliAgent({
sessionId: "s1",
sessionFile: "/tmp/session.jsonl",
workspaceDir: "/tmp",
prompt: "hi",
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-3",
cliSessionId: "thread-123",
}),
).rejects.toThrow("exceeded timeout");
});
it("falls back to per-agent workspace when workspaceDir is missing", async () => {
@@ -94,14 +165,18 @@ describe("runCliAgent resume cleanup", () => {
},
} satisfies OpenClawConfig;
runExecMock.mockResolvedValue({ stdout: "", stderr: "" });
runCommandWithTimeoutMock.mockResolvedValueOnce({
stdout: "ok",
stderr: "",
code: 0,
signal: null,
killed: false,
});
supervisorSpawnMock.mockResolvedValueOnce(
createManagedRun({
reason: "exit",
exitCode: 0,
exitSignal: null,
durationMs: 25,
stdout: "ok",
stderr: "",
timedOut: false,
noOutputTimedOut: false,
}),
);
try {
await runCliAgent({
@@ -114,264 +189,33 @@ describe("runCliAgent resume cleanup", () => {
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-1",
runId: "run-4",
});
} finally {
await fs.rm(tempDir, { recursive: true, force: true });
}
const options = runCommandWithTimeoutMock.mock.calls[0]?.[1] as { cwd?: string };
expect(options.cwd).toBe(path.resolve(fallbackWorkspace));
const input = supervisorSpawnMock.mock.calls[0]?.[0] as { cwd?: string };
expect(input.cwd).toBe(path.resolve(fallbackWorkspace));
});
});
it("throws when sessionKey is malformed", async () => {
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-"));
const mainWorkspace = path.join(tempDir, "workspace-main");
const researchWorkspace = path.join(tempDir, "workspace-research");
await fs.mkdir(mainWorkspace, { recursive: true });
await fs.mkdir(researchWorkspace, { recursive: true });
const cfg = {
agents: {
defaults: {
workspace: mainWorkspace,
describe("resolveCliNoOutputTimeoutMs", () => {
it("uses backend-configured resume watchdog override", () => {
const timeoutMs = resolveCliNoOutputTimeoutMs({
backend: {
command: "codex",
reliability: {
watchdog: {
resume: {
noOutputTimeoutMs: 42_000,
},
},
},
list: [{ id: "research", workspace: researchWorkspace }],
},
} satisfies OpenClawConfig;
try {
await expect(
runCliAgent({
sessionId: "s1",
sessionKey: "agent::broken",
agentId: "research",
sessionFile: "/tmp/session.jsonl",
workspaceDir: undefined as unknown as string,
config: cfg,
prompt: "hi",
provider: "codex-cli",
model: "gpt-5.2-codex",
timeoutMs: 1_000,
runId: "run-2",
}),
).rejects.toThrow("Malformed agent session key");
} finally {
await fs.rm(tempDir, { recursive: true, force: true });
}
expect(runCommandWithTimeoutMock).not.toHaveBeenCalled();
});
});
describe("cleanupSuspendedCliProcesses", () => {
beforeEach(() => {
runExecMock.mockReset();
});
it("skips when no session tokens are configured", async () => {
await cleanupSuspendedCliProcesses(
{
command: "tool",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).not.toHaveBeenCalled();
});
it("matches sessionArg-based commands", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: [
` 40 ${selfPid} T+ claude --session-id thread-1 -p`,
` 41 ${selfPid} S claude --session-id thread-2 -p`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupSuspendedCliProcesses(
{
command: "claude",
sessionArg: "--session-id",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(2);
const killCall = runExecMock.mock.calls[1] ?? [];
expect(killCall[0]).toBe("kill");
expect(killCall[1]).toEqual(["-9", "40"]);
});
it("matches resumeArgs with positional session id", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: [
` 50 ${selfPid} T codex exec resume thread-99 --color never --sandbox read-only`,
` 51 ${selfPid} T codex exec resume other --color never --sandbox read-only`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupSuspendedCliProcesses(
{
command: "codex",
resumeArgs: ["exec", "resume", "{sessionId}", "--color", "never", "--sandbox", "read-only"],
} as CliBackendConfig,
1,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(2);
const killCall = runExecMock.mock.calls[1] ?? [];
expect(killCall[0]).toBe("kill");
expect(killCall[1]).toEqual(["-9", "50", "51"]);
});
it("only kills child processes of current process (ppid validation)", async () => {
const selfPid = process.pid;
const childPid = selfPid + 1;
const unrelatedPid = 9999;
runExecMock
.mockResolvedValueOnce({
stdout: [
` ${childPid} ${selfPid} T claude --session-id thread-1 -p`,
` ${unrelatedPid} 100 T claude --session-id thread-2 -p`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupSuspendedCliProcesses(
{
command: "claude",
sessionArg: "--session-id",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(2);
const killCall = runExecMock.mock.calls[1] ?? [];
expect(killCall[0]).toBe("kill");
// Only childPid killed; unrelatedPid (ppid=100) excluded
expect(killCall[1]).toEqual(["-9", String(childPid)]);
});
it("skips all processes when none are children of current process", async () => {
runExecMock.mockResolvedValueOnce({
stdout: [
" 200 100 T claude --session-id thread-1 -p",
" 201 100 T claude --session-id thread-2 -p",
].join("\n"),
stderr: "",
timeoutMs: 120_000,
useResume: true,
});
await cleanupSuspendedCliProcesses(
{
command: "claude",
sessionArg: "--session-id",
} as CliBackendConfig,
0,
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
// Only ps called — no kill because no matching ppid
expect(runExecMock).toHaveBeenCalledTimes(1);
});
});
describe("cleanupResumeProcesses", () => {
beforeEach(() => {
runExecMock.mockReset();
});
it("only kills resume processes owned by current process", async () => {
const selfPid = process.pid;
runExecMock
.mockResolvedValueOnce({
stdout: [
` ${selfPid + 1} ${selfPid} codex exec resume abc-123`,
` ${selfPid + 2} 999 codex exec resume abc-123`,
].join("\n"),
stderr: "",
})
.mockResolvedValueOnce({ stdout: "", stderr: "" })
.mockResolvedValueOnce({ stdout: "", stderr: "" });
await cleanupResumeProcesses(
{
command: "codex",
resumeArgs: ["exec", "resume", "{sessionId}"],
} as CliBackendConfig,
"abc-123",
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
expect(runExecMock).toHaveBeenCalledTimes(3);
const termCall = runExecMock.mock.calls[1] ?? [];
expect(termCall[0]).toBe("kill");
expect(termCall[1]).toEqual(["-TERM", String(selfPid + 1)]);
const killCall = runExecMock.mock.calls[2] ?? [];
expect(killCall[0]).toBe("kill");
expect(killCall[1]).toEqual(["-9", String(selfPid + 1)]);
});
it("skips kill when no resume processes match ppid", async () => {
runExecMock.mockResolvedValueOnce({
stdout: [" 300 100 codex exec resume abc-123", " 301 200 codex exec resume abc-123"].join(
"\n",
),
stderr: "",
});
await cleanupResumeProcesses(
{
command: "codex",
resumeArgs: ["exec", "resume", "{sessionId}"],
} as CliBackendConfig,
"abc-123",
);
if (process.platform === "win32") {
expect(runExecMock).not.toHaveBeenCalled();
return;
}
// Only ps called — no kill because no matching ppid
expect(runExecMock).toHaveBeenCalledTimes(1);
expect(timeoutMs).toBe(42_000);
});
});