fix(queue): harden drain/abort/timeout race handling

- reject new lane enqueues once gateway drain begins
- always reset lane draining state and isolate onWait callback failures
- persist per-session abort cutoff and skip stale queued messages
- avoid false 600s agentTurn timeout in isolated cron jobs

Fixes #27407
Fixes #27332
Fixes #27427

Co-authored-by: Kevin Shenghui <shenghuikevin@github.com>
Co-authored-by: zjmy <zhangjunmengyang@gmail.com>
Co-authored-by: suko <miha.sukic@gmail.com>
This commit is contained in:
Peter Steinberger
2026-02-26 13:43:30 +01:00
parent 1aef45bc06
commit c397a02c9a
13 changed files with 551 additions and 42 deletions

View File

@@ -9,6 +9,7 @@ const consumeGatewaySigusr1RestartAuthorization = vi.fn(() => true);
const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
const markGatewaySigusr1RestartHandled = vi.fn();
const getActiveTaskCount = vi.fn(() => 0);
const markGatewayDraining = vi.fn();
const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
const resetAllLanes = vi.fn();
const restartGatewayProcessWithFreshPid = vi.fn<
@@ -37,6 +38,7 @@ vi.mock("../../infra/process-respawn.js", () => ({
vi.mock("../../process/command-queue.js", () => ({
getActiveTaskCount: () => getActiveTaskCount(),
markGatewayDraining: () => markGatewayDraining(),
waitForActiveTasks: (timeoutMs: number) => waitForActiveTasks(timeoutMs),
resetAllLanes: () => resetAllLanes(),
}));
@@ -213,6 +215,7 @@ describe("runGatewayLoop", () => {
await new Promise<void>((resolve) => setImmediate(resolve));
expect(waitForActiveTasks).toHaveBeenCalledWith(30_000);
expect(markGatewayDraining).toHaveBeenCalledTimes(1);
expect(gatewayLog.warn).toHaveBeenCalledWith(DRAIN_TIMEOUT_LOG);
expect(closeFirst).toHaveBeenCalledWith({
reason: "gateway restarting",
@@ -229,6 +232,7 @@ describe("runGatewayLoop", () => {
restartExpectedMs: 1500,
});
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(2);
expect(markGatewayDraining).toHaveBeenCalledTimes(2);
expect(resetAllLanes).toHaveBeenCalledTimes(2);
expect(acquireGatewayLock).toHaveBeenCalledTimes(3);
});

View File

@@ -9,6 +9,7 @@ import {
import { createSubsystemLogger } from "../../logging/subsystem.js";
import {
getActiveTaskCount,
markGatewayDraining,
resetAllLanes,
waitForActiveTasks,
} from "../../process/command-queue.js";
@@ -111,6 +112,9 @@ export async function runGatewayLoop(params: {
// On restart, wait for in-flight agent turns to finish before
// tearing down the server so buffered messages are delivered.
if (isRestart) {
// Reject new enqueues immediately during the drain window so
// sessions get an explicit restart error instead of silent task loss.
markGatewayDraining();
const activeTasks = getActiveTaskCount();
if (activeTasks > 0) {
gatewayLog.info(