mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 09:21:23 +00:00
fix: reset stale execution state after SIGUSR1 in-process restart (#15195)
Merged via /review-pr -> /prepare-pr -> /merge-pr.
Prepared head SHA: 676f9ec451
Co-authored-by: joeykrug <5925937+joeykrug@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
@@ -52,6 +52,8 @@ async function main() {
|
||||
{ consumeGatewaySigusr1RestartAuthorization, isGatewaySigusr1RestartExternallyAllowed },
|
||||
{ defaultRuntime },
|
||||
{ enableConsoleCapture, setConsoleTimestampPrefix },
|
||||
commandQueueMod,
|
||||
{ createRestartIterationHook },
|
||||
] = await Promise.all([
|
||||
import("../config/config.js"),
|
||||
import("../gateway/server.js"),
|
||||
@@ -61,6 +63,8 @@ async function main() {
|
||||
import("../infra/restart.js"),
|
||||
import("../runtime.js"),
|
||||
import("../logging.js"),
|
||||
import("../process/command-queue.js"),
|
||||
import("../process/restart-recovery.js"),
|
||||
] as const);
|
||||
|
||||
enableConsoleCapture();
|
||||
@@ -132,14 +136,32 @@ async function main() {
|
||||
`gateway: received ${signal}; ${isRestart ? "restarting" : "shutting down"}`,
|
||||
);
|
||||
|
||||
const DRAIN_TIMEOUT_MS = 30_000;
|
||||
const SHUTDOWN_TIMEOUT_MS = 5_000;
|
||||
const forceExitMs = isRestart ? DRAIN_TIMEOUT_MS + SHUTDOWN_TIMEOUT_MS : SHUTDOWN_TIMEOUT_MS;
|
||||
forceExitTimer = setTimeout(() => {
|
||||
defaultRuntime.error("gateway: shutdown timed out; exiting without full cleanup");
|
||||
cleanupSignals();
|
||||
process.exit(0);
|
||||
}, 5000);
|
||||
}, forceExitMs);
|
||||
|
||||
void (async () => {
|
||||
try {
|
||||
if (isRestart) {
|
||||
const activeTasks = commandQueueMod.getActiveTaskCount();
|
||||
if (activeTasks > 0) {
|
||||
defaultRuntime.log(
|
||||
`gateway: draining ${activeTasks} active task(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
|
||||
);
|
||||
const { drained } = await commandQueueMod.waitForActiveTasks(DRAIN_TIMEOUT_MS);
|
||||
if (drained) {
|
||||
defaultRuntime.log("gateway: all active tasks drained");
|
||||
} else {
|
||||
defaultRuntime.log("gateway: drain timeout reached; proceeding with restart");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await server?.close({
|
||||
reason: isRestart ? "gateway restarting" : "gateway stopping",
|
||||
restartExpectedMs: isRestart ? 1500 : null,
|
||||
@@ -196,8 +218,17 @@ async function main() {
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
const onIteration = createRestartIterationHook(() => {
|
||||
// After an in-process restart (SIGUSR1), reset command-queue lane state.
|
||||
// Interrupted tasks from the previous lifecycle may have left `active`
|
||||
// counts elevated (their finally blocks never ran), permanently blocking
|
||||
// new work from draining.
|
||||
commandQueueMod.resetAllLanes();
|
||||
});
|
||||
|
||||
// eslint-disable-next-line no-constant-condition
|
||||
while (true) {
|
||||
onIteration();
|
||||
try {
|
||||
server = await startGatewayServer(port, { bind });
|
||||
} catch (err) {
|
||||
@@ -210,7 +241,7 @@ async function main() {
|
||||
});
|
||||
}
|
||||
} finally {
|
||||
await (lock as GatewayLockHandle | null)?.release();
|
||||
await lock?.release();
|
||||
cleanupSignals();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user