fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)

* exec: clean up PTY resources on timeout and exit

* cli: harden resume cleanup and watchdog stalled runs

* cli: productionize PTY and resume reliability paths

* docs: add PTY process supervision architecture plan

* docs: rewrite PTY supervision plan as pre-rewrite baseline

* docs: switch PTY supervision plan to one-go execution

* docs: add one-line root cause to PTY supervision plan

* docs: add OS contracts and test matrix to PTY supervision plan

* docs: define process-supervisor package placement and scope

* docs: tie supervisor plan to existing CI lanes

* docs: place PTY supervisor plan under src/process

* refactor(process): route exec and cli runs through supervisor

* docs(process): refresh PTY supervision plan

* wip

* fix(process): harden supervisor timeout and PTY termination

* fix(process): harden supervisor adapters env and wait handling

* ci: avoid failing formal conformance on comment permissions

* test(ui): fix cron request mock argument typing

* fix(ui): remove leftover conflict marker

* fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
Onur
2026-02-16 09:32:05 +08:00
committed by GitHub
parent a73e7786e7
commit cd44a0d01e
32 changed files with 2759 additions and 855 deletions

View File

@@ -0,0 +1,197 @@
import type { ManagedRunStdin } from "../types.js";
import { killProcessTree } from "../../kill-tree.js";
/** Payload delivered to `onExit` listeners: the exit code plus an optional numeric signal. */
type PtyExitEvent = { exitCode: number; signal?: number };
/** Subscription handle that `onData` / `onExit` may return; calling `dispose` releases it. */
type PtyDisposable = { dispose: () => void };
/**
 * The subset of the spawned PTY handle that this adapter relies on.
 * Declared locally so the adapter does not depend on the PTY library's own typings.
 */
type PtySpawnHandle = {
// Process id of the spawned PTY process.
pid: number;
// Sends input to the PTY.
write: (data: string | Buffer) => void;
// Subscribes to output chunks; the return value may be void, so callers must tolerate no handle.
onData: (listener: (value: string) => void) => PtyDisposable | void;
// Subscribes to the exit notification; the return value may likewise be void.
onExit: (listener: (event: PtyExitEvent) => void) => PtyDisposable | void;
// Asks the PTY to terminate, optionally with a named signal.
kill: (signal?: string) => void;
};
/**
 * Signature of the `spawn` function looked up on the dynamically imported PTY module.
 * Every option is optional; `env` must already be a plain string-to-string map.
 */
type PtySpawn = (
file: string,
args: string[] | string,
options: {
name?: string;
cols?: number;
rows?: number;
cwd?: string;
env?: Record<string, string>;
},
) => PtySpawnHandle;
/**
 * Shape assumed for the dynamically imported PTY module.
 * `spawn` may be exposed at the top level or under `default`, so both locations are modelled
 * and both are optional.
 */
type PtyModule = {
spawn?: PtySpawn;
default?: {
spawn?: PtySpawn;
};
};
/**
 * Builds a fresh string-only copy of an environment map.
 *
 * Keys whose value is `undefined` are omitted; every remaining value is passed
 * through `String(...)`. A missing `env` produces an empty object.
 *
 * @param env - Environment map that may contain `undefined` values.
 * @returns A new object containing only the defined entries of `env`.
 */
function toStringEnv(env?: NodeJS.ProcessEnv): Record<string, string> {
  const result: Record<string, string> = {};
  if (env) {
    Object.entries(env).forEach(([name, raw]) => {
      if (raw !== undefined) {
        result[name] = String(raw);
      }
    });
  }
  return result;
}
/**
 * Handle returned by `createPtyAdapter` for driving one PTY-backed process.
 */
export type PtyAdapter = {
// Process id of the PTY process; undefined when the PTY reports a falsy pid.
pid?: number;
// Writable side of the PTY; `end()` writes an EOF control character rather than closing a stream.
stdin?: ManagedRunStdin;
// Registers a listener for PTY output chunks.
onStdout: (listener: (chunk: string) => void) => void;
// Accepted for interface symmetry; a PTY has a single merged output stream, so the listener is never called.
onStderr: (listener: (chunk: string) => void) => void;
// Resolves once an exit is observed, or as soon as `kill` or `dispose` is called.
wait: () => Promise<{ code: number | null; signal: NodeJS.Signals | number | null }>;
// Requests termination with the given signal.
kill: (signal?: NodeJS.Signals) => void;
// Releases the PTY event subscriptions and unblocks any pending `wait()`.
dispose: () => void;
};
/**
 * Spawns `params.shell` inside a pseudo-terminal and wraps the resulting handle
 * in a `PtyAdapter`.
 *
 * The PTY module is imported lazily so that merely loading this file does not
 * require PTY support to be present.
 *
 * Wait semantics: the result of `wait()` is settled exactly once, by whichever
 * happens first — the PTY's exit event, a call to `kill()`, or a call to
 * `dispose()`. Later events are ignored.
 *
 * @param params.shell - Executable to launch in the PTY.
 * @param params.args - Arguments passed to the executable.
 * @param params.cwd - Working directory for the process, if any.
 * @param params.env - Environment for the process; `undefined` values are dropped.
 *   When omitted, no `env` option is passed to the PTY at all.
 * @param params.cols - Terminal width; defaults to 120.
 * @param params.rows - Terminal height; defaults to 30.
 * @param params.name - Terminal name; defaults to `process.env.TERM`, then `"xterm-256color"`.
 * @returns The adapter controlling the spawned PTY process.
 * @throws Error when the imported module exposes no `spawn` function.
 */
export async function createPtyAdapter(params: {
  shell: string;
  args: string[];
  cwd?: string;
  env?: NodeJS.ProcessEnv;
  cols?: number;
  rows?: number;
  name?: string;
}): Promise<PtyAdapter> {
  type WaitResult = { code: number | null; signal: NodeJS.Signals | number | null };

  const module = (await import("@lydell/node-pty")) as unknown as PtyModule;
  // `spawn` may live at the top level or under `default`, depending on module interop.
  const spawn = module.spawn ?? module.default?.spawn;
  if (!spawn) {
    throw new Error("PTY support is unavailable (node-pty spawn not found).");
  }
  const pty = spawn(params.shell, params.args, {
    cwd: params.cwd,
    env: params.env ? toStringEnv(params.env) : undefined,
    name: params.name ?? process.env.TERM ?? "xterm-256color",
    cols: params.cols ?? 120,
    rows: params.rows ?? 30,
  });

  // Every output subscription is tracked so that dispose() can release all of
  // them, even when onStdout() is called more than once.
  const dataListeners: PtyDisposable[] = [];
  let exitListener: PtyDisposable | null = null;
  let waitResult: WaitResult | null = null;
  let resolveWait: ((value: WaitResult) => void) | null = null;
  let waitPromise: Promise<WaitResult> | null = null;

  // First caller wins; subsequent settle attempts are no-ops.
  const settleWait = (value: WaitResult) => {
    if (waitResult) {
      return;
    }
    waitResult = value;
    if (resolveWait) {
      const resolve = resolveWait;
      resolveWait = null;
      resolve(value);
    }
  };

  exitListener =
    pty.onExit((event) => {
      // A signal of 0 or undefined means "no signal"; normalise both to null.
      const signal = event.signal && event.signal !== 0 ? event.signal : null;
      settleWait({ code: event.exitCode ?? null, signal });
    }) ?? null;

  const stdin: ManagedRunStdin = {
    destroyed: false,
    write: (data, cb) => {
      try {
        pty.write(data);
        cb?.(null);
      } catch (err) {
        cb?.(err as Error);
      }
    },
    end: () => {
      try {
        // A PTY has no separate stdin stream to close, so send the platform's
        // EOF control character instead (Ctrl-Z on Windows, Ctrl-D elsewhere).
        const eof = process.platform === "win32" ? "\x1a" : "\x04";
        pty.write(eof);
      } catch {
        // ignore EOF errors
      }
    },
  };

  const onStdout = (listener: (chunk: string) => void) => {
    const subscription = pty.onData((chunk) => {
      listener(chunk.toString());
    });
    // onData may return nothing; only track real disposables.
    if (subscription) {
      dataListeners.push(subscription);
    }
  };

  const onStderr = (_listener: (chunk: string) => void) => {
    // PTY gives a unified output stream.
  };

  const wait = async () => {
    if (waitResult) {
      return waitResult;
    }
    if (!waitPromise) {
      // The executor runs synchronously, so waitResult is still unset here and
      // the resolver only needs to be parked for settleWait() to pick up.
      waitPromise = new Promise<WaitResult>((resolve) => {
        resolveWait = resolve;
      });
    }
    return waitPromise;
  };

  const kill = (signal: NodeJS.Signals = "SIGKILL") => {
    try {
      if (signal === "SIGKILL" && typeof pty.pid === "number" && pty.pid > 0) {
        // Forced termination goes through the process-tree helper.
        killProcessTree(pty.pid);
      } else if (process.platform === "win32") {
        // On Windows the PTY's kill is called without a signal argument.
        pty.kill();
      } else {
        pty.kill(signal);
      }
    } catch {
      // ignore kill errors
    }
    // Some PTY hosts do not emit `onExit` reliably after kill.
    // Ensure waiters can progress on forced termination.
    settleWait({ code: null, signal });
  };

  const dispose = () => {
    // splice(0) empties the list first, so a repeated dispose() is a no-op.
    for (const subscription of dataListeners.splice(0)) {
      try {
        subscription.dispose();
      } catch {
        // ignore disposal errors
      }
    }
    try {
      exitListener?.dispose();
    } catch {
      // ignore disposal errors
    }
    exitListener = null;
    // Unblock any pending wait() that never saw an exit event.
    settleWait({ code: null, signal: null });
  };

  return {
    pid: pty.pid || undefined,
    stdin,
    onStdout,
    onStderr,
    wait,
    kill,
    dispose,
  };
}