fix: reset stale execution state after SIGUSR1 in-process restart (#15195)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 676f9ec451
Co-authored-by: joeykrug <5925937+joeykrug@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
Joseph Krug
2026-02-13 16:30:09 -04:00
committed by GitHub
parent 2086cdfb9b
commit 4e9f933e88
11 changed files with 572 additions and 20 deletions

View File

@@ -52,6 +52,8 @@ async function main() {
{ consumeGatewaySigusr1RestartAuthorization, isGatewaySigusr1RestartExternallyAllowed },
{ defaultRuntime },
{ enableConsoleCapture, setConsoleTimestampPrefix },
commandQueueMod,
{ createRestartIterationHook },
] = await Promise.all([
import("../config/config.js"),
import("../gateway/server.js"),
@@ -61,6 +63,8 @@ async function main() {
import("../infra/restart.js"),
import("../runtime.js"),
import("../logging.js"),
import("../process/command-queue.js"),
import("../process/restart-recovery.js"),
] as const);
enableConsoleCapture();
@@ -132,14 +136,32 @@ async function main() {
`gateway: received ${signal}; ${isRestart ? "restarting" : "shutting down"}`,
);
const DRAIN_TIMEOUT_MS = 30_000;
const SHUTDOWN_TIMEOUT_MS = 5_000;
const forceExitMs = isRestart ? DRAIN_TIMEOUT_MS + SHUTDOWN_TIMEOUT_MS : SHUTDOWN_TIMEOUT_MS;
forceExitTimer = setTimeout(() => {
defaultRuntime.error("gateway: shutdown timed out; exiting without full cleanup");
cleanupSignals();
process.exit(0);
}, 5000);
}, forceExitMs);
void (async () => {
try {
if (isRestart) {
const activeTasks = commandQueueMod.getActiveTaskCount();
if (activeTasks > 0) {
defaultRuntime.log(
`gateway: draining ${activeTasks} active task(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
);
const { drained } = await commandQueueMod.waitForActiveTasks(DRAIN_TIMEOUT_MS);
if (drained) {
defaultRuntime.log("gateway: all active tasks drained");
} else {
defaultRuntime.log("gateway: drain timeout reached; proceeding with restart");
}
}
}
await server?.close({
reason: isRestart ? "gateway restarting" : "gateway stopping",
restartExpectedMs: isRestart ? 1500 : null,
@@ -196,8 +218,17 @@ async function main() {
}
throw err;
}
const onIteration = createRestartIterationHook(() => {
// After an in-process restart (SIGUSR1), reset command-queue lane state.
// Interrupted tasks from the previous lifecycle may have left `active`
// counts elevated (their finally blocks never ran), permanently blocking
// new work from draining.
commandQueueMod.resetAllLanes();
});
// eslint-disable-next-line no-constant-condition
while (true) {
onIteration();
try {
server = await startGatewayServer(port, { bind });
} catch (err) {
@@ -210,7 +241,7 @@ async function main() {
});
}
} finally {
await (lock as GatewayLockHandle | null)?.release();
await lock?.release();
cleanupSignals();
}
}