mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 04:19:34 +00:00
fix(queue): harden drain/abort/timeout race handling
- reject new lane enqueues once gateway drain begins
- always reset lane draining state and isolate onWait callback failures
- persist per-session abort cutoff and skip stale queued messages
- avoid false 600s agentTurn timeout in isolated cron jobs

Fixes #27407
Fixes #27332
Fixes #27427

Co-authored-by: Kevin Shenghui <shenghuikevin@github.com>
Co-authored-by: zjmy <zhangjunmengyang@gmail.com>
Co-authored-by: suko <miha.sukic@gmail.com>
This commit is contained in:
@@ -9,6 +9,7 @@ const consumeGatewaySigusr1RestartAuthorization = vi.fn(() => true);
|
||||
const isGatewaySigusr1RestartExternallyAllowed = vi.fn(() => false);
|
||||
const markGatewaySigusr1RestartHandled = vi.fn();
|
||||
const getActiveTaskCount = vi.fn(() => 0);
|
||||
const markGatewayDraining = vi.fn();
|
||||
const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
|
||||
const resetAllLanes = vi.fn();
|
||||
const restartGatewayProcessWithFreshPid = vi.fn<
|
||||
@@ -37,6 +38,7 @@ vi.mock("../../infra/process-respawn.js", () => ({
|
||||
|
||||
vi.mock("../../process/command-queue.js", () => ({
|
||||
getActiveTaskCount: () => getActiveTaskCount(),
|
||||
markGatewayDraining: () => markGatewayDraining(),
|
||||
waitForActiveTasks: (timeoutMs: number) => waitForActiveTasks(timeoutMs),
|
||||
resetAllLanes: () => resetAllLanes(),
|
||||
}));
|
||||
@@ -213,6 +215,7 @@ describe("runGatewayLoop", () => {
|
||||
await new Promise<void>((resolve) => setImmediate(resolve));
|
||||
|
||||
expect(waitForActiveTasks).toHaveBeenCalledWith(30_000);
|
||||
expect(markGatewayDraining).toHaveBeenCalledTimes(1);
|
||||
expect(gatewayLog.warn).toHaveBeenCalledWith(DRAIN_TIMEOUT_LOG);
|
||||
expect(closeFirst).toHaveBeenCalledWith({
|
||||
reason: "gateway restarting",
|
||||
@@ -229,6 +232,7 @@ describe("runGatewayLoop", () => {
|
||||
restartExpectedMs: 1500,
|
||||
});
|
||||
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(2);
|
||||
expect(markGatewayDraining).toHaveBeenCalledTimes(2);
|
||||
expect(resetAllLanes).toHaveBeenCalledTimes(2);
|
||||
expect(acquireGatewayLock).toHaveBeenCalledTimes(3);
|
||||
});
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||
import {
|
||||
getActiveTaskCount,
|
||||
markGatewayDraining,
|
||||
resetAllLanes,
|
||||
waitForActiveTasks,
|
||||
} from "../../process/command-queue.js";
|
||||
@@ -111,6 +112,9 @@ export async function runGatewayLoop(params: {
|
||||
// On restart, wait for in-flight agent turns to finish before
|
||||
// tearing down the server so buffered messages are delivered.
|
||||
if (isRestart) {
|
||||
// Reject new enqueues immediately during the drain window so
|
||||
// sessions get an explicit restart error instead of silent task loss.
|
||||
markGatewayDraining();
|
||||
const activeTasks = getActiveTaskCount();
|
||||
if (activeTasks > 0) {
|
||||
gatewayLog.info(
|
||||
|
||||
Reference in New Issue
Block a user