fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)

* exec: clean up PTY resources on timeout and exit * cli: harden resume cleanup and watchdog stalled runs * cli: productionize PTY and resume reliability paths * docs: add PTY process supervision architecture plan * docs: rewrite PTY supervision plan as pre-rewrite baseline * docs: switch PTY supervision plan to one-go execution * docs: add one-line root cause to PTY supervision plan * docs: add OS contracts and test matrix to PTY supervision plan * docs: define process-supervisor package placement and scope * docs: tie supervisor plan to existing CI lanes * docs: place PTY supervisor plan under src/process * refactor(process): route exec and cli runs through supervisor * docs(process): refresh PTY supervision plan * wip * fix(process): harden supervisor timeout and PTY termination * fix(process): harden supervisor adapters env and wait handling * ci: avoid failing formal conformance on comment permissions * test(ui): fix cron request mock argument typing * fix(ui): remove leftover conflict marker * fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
2026-05-08 13:11:22 +00:00 · 2026-02-16 09:32:05 +08:00
parent a73e7786e7
commit cd44a0d01e
32 changed files with 2759 additions and 855 deletions
--- a/src/agents/cli-runner.e2e.test.ts
+++ b/src/agents/cli-runner.e2e.test.ts
@@ -3,50 +3,69 @@ import os from "node:os";
 import path from "node:path";
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import type { OpenClawConfig } from "../config/config.js";
-import type { CliBackendConfig } from "../config/types.js";
 import { runCliAgent } from "./cli-runner.js";
-import { cleanupResumeProcesses, cleanupSuspendedCliProcesses } from "./cli-runner/helpers.js";
+import { resolveCliNoOutputTimeoutMs } from "./cli-runner/helpers.js";

-const runCommandWithTimeoutMock = vi.fn();
-const runExecMock = vi.fn();
+const supervisorSpawnMock = vi.fn();

-vi.mock("../process/exec.js", () => ({
-  runCommandWithTimeout: (...args: unknown[]) => runCommandWithTimeoutMock(...args),
-  runExec: (...args: unknown[]) => runExecMock(...args),
+vi.mock("../process/supervisor/index.js", () => ({
+  getProcessSupervisor: () => ({
+    spawn: (...args: unknown[]) => supervisorSpawnMock(...args),
+    cancel: vi.fn(),
+    cancelScope: vi.fn(),
+    reconcileOrphans: vi.fn(),
+    getRecord: vi.fn(),
+  }),
 }));

-describe("runCliAgent resume cleanup", () => {
+type MockRunExit = {
+  reason:
+    | "manual-cancel"
+    | "overall-timeout"
+    | "no-output-timeout"
+    | "spawn-error"
+    | "signal"
+    | "exit";
+  exitCode: number | null;
+  exitSignal: NodeJS.Signals | number | null;
+  durationMs: number;
+  stdout: string;
+  stderr: string;
+  timedOut: boolean;
+  noOutputTimedOut: boolean;
+};
+
+function createManagedRun(exit: MockRunExit, pid = 1234) {
+  return {
+    runId: "run-supervisor",
+    pid,
+    startedAtMs: Date.now(),
+    stdin: undefined,
+    wait: vi.fn().mockResolvedValue(exit),
+    cancel: vi.fn(),
+  };
+}
+
+describe("runCliAgent with process supervisor", () => {
  beforeEach(() => {
-    runCommandWithTimeoutMock.mockReset();
-    runExecMock.mockReset();
+    supervisorSpawnMock.mockReset();
  });

-  it("kills stale resume processes for codex sessions", async () => {
-    const selfPid = process.pid;
-
-    runExecMock
-      .mockResolvedValueOnce({
-        stdout: "  1 999 S /bin/launchd\n",
+  it("runs CLI through supervisor and returns payload", async () => {
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "exit",
+        exitCode: 0,
+        exitSignal: null,
+        durationMs: 50,
+        stdout: "ok",
        stderr: "",
-      }) // cleanupSuspendedCliProcesses (ps) — ppid 999 != selfPid, no match
-      .mockResolvedValueOnce({
-        stdout: [
-          `  ${selfPid + 1} ${selfPid} codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
-          `  ${selfPid + 2} 999 codex exec resume thread-123 --color never --sandbox read-only --skip-git-repo-check`,
-        ].join("\n"),
-        stderr: "",
-      }) // cleanupResumeProcesses (ps)
-      .mockResolvedValueOnce({ stdout: "", stderr: "" }) // cleanupResumeProcesses (kill -TERM)
-      .mockResolvedValueOnce({ stdout: "", stderr: "" }); // cleanupResumeProcesses (kill -9)
-    runCommandWithTimeoutMock.mockResolvedValueOnce({
-      stdout: "ok",
-      stderr: "",
-      code: 0,
-      signal: null,
-      killed: false,
-    });
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );

-    await runCliAgent({
+    const result = await runCliAgent({
      sessionId: "s1",
      sessionFile: "/tmp/session.jsonl",
      workspaceDir: "/tmp",
@@ -58,28 +77,80 @@ describe("runCliAgent resume cleanup", () => {
      cliSessionId: "thread-123",
    });

-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
+    expect(result.payloads?.[0]?.text).toBe("ok");
+    expect(supervisorSpawnMock).toHaveBeenCalledTimes(1);
+    const input = supervisorSpawnMock.mock.calls[0]?.[0] as {
+      argv?: string[];
+      mode?: string;
+      timeoutMs?: number;
+      noOutputTimeoutMs?: number;
+      replaceExistingScope?: boolean;
+      scopeKey?: string;
+    };
+    expect(input.mode).toBe("child");
+    expect(input.argv?.[0]).toBe("codex");
+    expect(input.timeoutMs).toBe(1_000);
+    expect(input.noOutputTimeoutMs).toBeGreaterThanOrEqual(1_000);
+    expect(input.replaceExistingScope).toBe(true);
+    expect(input.scopeKey).toContain("thread-123");
+  });

-    expect(runExecMock).toHaveBeenCalledTimes(4);
+  it("fails with timeout when no-output watchdog trips", async () => {
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "no-output-timeout",
+        exitCode: null,
+        exitSignal: "SIGKILL",
+        durationMs: 200,
+        stdout: "",
+        stderr: "",
+        timedOut: true,
+        noOutputTimedOut: true,
+      }),
+    );

-    // Second call: cleanupResumeProcesses ps
-    const psCall = runExecMock.mock.calls[1] ?? [];
-    expect(psCall[0]).toBe("ps");
+    await expect(
+      runCliAgent({
+        sessionId: "s1",
+        sessionFile: "/tmp/session.jsonl",
+        workspaceDir: "/tmp",
+        prompt: "hi",
+        provider: "codex-cli",
+        model: "gpt-5.2-codex",
+        timeoutMs: 1_000,
+        runId: "run-2",
+        cliSessionId: "thread-123",
+      }),
+    ).rejects.toThrow("produced no output");
+  });

-    // Third call: TERM, only the child PID
-    const termCall = runExecMock.mock.calls[2] ?? [];
-    expect(termCall[0]).toBe("kill");
-    const termArgs = termCall[1] as string[];
-    expect(termArgs).toEqual(["-TERM", String(selfPid + 1)]);
+  it("fails with timeout when overall timeout trips", async () => {
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "overall-timeout",
+        exitCode: null,
+        exitSignal: "SIGKILL",
+        durationMs: 200,
+        stdout: "",
+        stderr: "",
+        timedOut: true,
+        noOutputTimedOut: false,
+      }),
+    );

-    // Fourth call: KILL, only the child PID
-    const killCall = runExecMock.mock.calls[3] ?? [];
-    expect(killCall[0]).toBe("kill");
-    const killArgs = killCall[1] as string[];
-    expect(killArgs).toEqual(["-9", String(selfPid + 1)]);
+    await expect(
+      runCliAgent({
+        sessionId: "s1",
+        sessionFile: "/tmp/session.jsonl",
+        workspaceDir: "/tmp",
+        prompt: "hi",
+        provider: "codex-cli",
+        model: "gpt-5.2-codex",
+        timeoutMs: 1_000,
+        runId: "run-3",
+        cliSessionId: "thread-123",
+      }),
+    ).rejects.toThrow("exceeded timeout");
  });

  it("falls back to per-agent workspace when workspaceDir is missing", async () => {
@@ -94,14 +165,18 @@ describe("runCliAgent resume cleanup", () => {
      },
    } satisfies OpenClawConfig;

-    runExecMock.mockResolvedValue({ stdout: "", stderr: "" });
-    runCommandWithTimeoutMock.mockResolvedValueOnce({
-      stdout: "ok",
-      stderr: "",
-      code: 0,
-      signal: null,
-      killed: false,
-    });
+    supervisorSpawnMock.mockResolvedValueOnce(
+      createManagedRun({
+        reason: "exit",
+        exitCode: 0,
+        exitSignal: null,
+        durationMs: 25,
+        stdout: "ok",
+        stderr: "",
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );

    try {
      await runCliAgent({
@@ -114,264 +189,33 @@ describe("runCliAgent resume cleanup", () => {
        provider: "codex-cli",
        model: "gpt-5.2-codex",
        timeoutMs: 1_000,
-        runId: "run-1",
+        runId: "run-4",
      });
    } finally {
      await fs.rm(tempDir, { recursive: true, force: true });
    }

-    const options = runCommandWithTimeoutMock.mock.calls[0]?.[1] as { cwd?: string };
-    expect(options.cwd).toBe(path.resolve(fallbackWorkspace));
+    const input = supervisorSpawnMock.mock.calls[0]?.[0] as { cwd?: string };
+    expect(input.cwd).toBe(path.resolve(fallbackWorkspace));
  });
+});

-  it("throws when sessionKey is malformed", async () => {
-    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-"));
-    const mainWorkspace = path.join(tempDir, "workspace-main");
-    const researchWorkspace = path.join(tempDir, "workspace-research");
-    await fs.mkdir(mainWorkspace, { recursive: true });
-    await fs.mkdir(researchWorkspace, { recursive: true });
-    const cfg = {
-      agents: {
-        defaults: {
-          workspace: mainWorkspace,
+describe("resolveCliNoOutputTimeoutMs", () => {
+  it("uses backend-configured resume watchdog override", () => {
+    const timeoutMs = resolveCliNoOutputTimeoutMs({
+      backend: {
+        command: "codex",
+        reliability: {
+          watchdog: {
+            resume: {
+              noOutputTimeoutMs: 42_000,
+            },
+          },
        },
-        list: [{ id: "research", workspace: researchWorkspace }],
      },
-    } satisfies OpenClawConfig;
-
-    try {
-      await expect(
-        runCliAgent({
-          sessionId: "s1",
-          sessionKey: "agent::broken",
-          agentId: "research",
-          sessionFile: "/tmp/session.jsonl",
-          workspaceDir: undefined as unknown as string,
-          config: cfg,
-          prompt: "hi",
-          provider: "codex-cli",
-          model: "gpt-5.2-codex",
-          timeoutMs: 1_000,
-          runId: "run-2",
-        }),
-      ).rejects.toThrow("Malformed agent session key");
-    } finally {
-      await fs.rm(tempDir, { recursive: true, force: true });
-    }
-    expect(runCommandWithTimeoutMock).not.toHaveBeenCalled();
-  });
-});
-
-describe("cleanupSuspendedCliProcesses", () => {
-  beforeEach(() => {
-    runExecMock.mockReset();
-  });
-
-  it("skips when no session tokens are configured", async () => {
-    await cleanupSuspendedCliProcesses(
-      {
-        command: "tool",
-      } as CliBackendConfig,
-      0,
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    expect(runExecMock).not.toHaveBeenCalled();
-  });
-
-  it("matches sessionArg-based commands", async () => {
-    const selfPid = process.pid;
-    runExecMock
-      .mockResolvedValueOnce({
-        stdout: [
-          `  40 ${selfPid} T+ claude --session-id thread-1 -p`,
-          `  41 ${selfPid} S  claude --session-id thread-2 -p`,
-        ].join("\n"),
-        stderr: "",
-      })
-      .mockResolvedValueOnce({ stdout: "", stderr: "" });
-
-    await cleanupSuspendedCliProcesses(
-      {
-        command: "claude",
-        sessionArg: "--session-id",
-      } as CliBackendConfig,
-      0,
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    expect(runExecMock).toHaveBeenCalledTimes(2);
-    const killCall = runExecMock.mock.calls[1] ?? [];
-    expect(killCall[0]).toBe("kill");
-    expect(killCall[1]).toEqual(["-9", "40"]);
-  });
-
-  it("matches resumeArgs with positional session id", async () => {
-    const selfPid = process.pid;
-    runExecMock
-      .mockResolvedValueOnce({
-        stdout: [
-          `  50 ${selfPid} T  codex exec resume thread-99 --color never --sandbox read-only`,
-          `  51 ${selfPid} T  codex exec resume other --color never --sandbox read-only`,
-        ].join("\n"),
-        stderr: "",
-      })
-      .mockResolvedValueOnce({ stdout: "", stderr: "" });
-
-    await cleanupSuspendedCliProcesses(
-      {
-        command: "codex",
-        resumeArgs: ["exec", "resume", "{sessionId}", "--color", "never", "--sandbox", "read-only"],
-      } as CliBackendConfig,
-      1,
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    expect(runExecMock).toHaveBeenCalledTimes(2);
-    const killCall = runExecMock.mock.calls[1] ?? [];
-    expect(killCall[0]).toBe("kill");
-    expect(killCall[1]).toEqual(["-9", "50", "51"]);
-  });
-
-  it("only kills child processes of current process (ppid validation)", async () => {
-    const selfPid = process.pid;
-    const childPid = selfPid + 1;
-    const unrelatedPid = 9999;
-
-    runExecMock
-      .mockResolvedValueOnce({
-        stdout: [
-          `  ${childPid} ${selfPid} T  claude --session-id thread-1 -p`,
-          `  ${unrelatedPid} 100 T  claude --session-id thread-2 -p`,
-        ].join("\n"),
-        stderr: "",
-      })
-      .mockResolvedValueOnce({ stdout: "", stderr: "" });
-
-    await cleanupSuspendedCliProcesses(
-      {
-        command: "claude",
-        sessionArg: "--session-id",
-      } as CliBackendConfig,
-      0,
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    expect(runExecMock).toHaveBeenCalledTimes(2);
-    const killCall = runExecMock.mock.calls[1] ?? [];
-    expect(killCall[0]).toBe("kill");
-    // Only childPid killed; unrelatedPid (ppid=100) excluded
-    expect(killCall[1]).toEqual(["-9", String(childPid)]);
-  });
-
-  it("skips all processes when none are children of current process", async () => {
-    runExecMock.mockResolvedValueOnce({
-      stdout: [
-        "  200 100 T  claude --session-id thread-1 -p",
-        "  201 100 T  claude --session-id thread-2 -p",
-      ].join("\n"),
-      stderr: "",
+      timeoutMs: 120_000,
+      useResume: true,
    });
-
-    await cleanupSuspendedCliProcesses(
-      {
-        command: "claude",
-        sessionArg: "--session-id",
-      } as CliBackendConfig,
-      0,
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    // Only ps called — no kill because no matching ppid
-    expect(runExecMock).toHaveBeenCalledTimes(1);
-  });
-});
-
-describe("cleanupResumeProcesses", () => {
-  beforeEach(() => {
-    runExecMock.mockReset();
-  });
-
-  it("only kills resume processes owned by current process", async () => {
-    const selfPid = process.pid;
-
-    runExecMock
-      .mockResolvedValueOnce({
-        stdout: [
-          `  ${selfPid + 1} ${selfPid} codex exec resume abc-123`,
-          `  ${selfPid + 2} 999 codex exec resume abc-123`,
-        ].join("\n"),
-        stderr: "",
-      })
-      .mockResolvedValueOnce({ stdout: "", stderr: "" })
-      .mockResolvedValueOnce({ stdout: "", stderr: "" });
-
-    await cleanupResumeProcesses(
-      {
-        command: "codex",
-        resumeArgs: ["exec", "resume", "{sessionId}"],
-      } as CliBackendConfig,
-      "abc-123",
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    expect(runExecMock).toHaveBeenCalledTimes(3);
-
-    const termCall = runExecMock.mock.calls[1] ?? [];
-    expect(termCall[0]).toBe("kill");
-    expect(termCall[1]).toEqual(["-TERM", String(selfPid + 1)]);
-
-    const killCall = runExecMock.mock.calls[2] ?? [];
-    expect(killCall[0]).toBe("kill");
-    expect(killCall[1]).toEqual(["-9", String(selfPid + 1)]);
-  });
-
-  it("skips kill when no resume processes match ppid", async () => {
-    runExecMock.mockResolvedValueOnce({
-      stdout: ["  300 100 codex exec resume abc-123", "  301 200 codex exec resume abc-123"].join(
-        "\n",
-      ),
-      stderr: "",
-    });
-
-    await cleanupResumeProcesses(
-      {
-        command: "codex",
-        resumeArgs: ["exec", "resume", "{sessionId}"],
-      } as CliBackendConfig,
-      "abc-123",
-    );
-
-    if (process.platform === "win32") {
-      expect(runExecMock).not.toHaveBeenCalled();
-      return;
-    }
-
-    // Only ps called — no kill because no matching ppid
-    expect(runExecMock).toHaveBeenCalledTimes(1);
+    expect(timeoutMs).toBe(42_000);
  });
 });