fix(queue): harden drain/abort/timeout race handling

- reject new lane enqueues once gateway drain begins
- always reset lane draining state and isolate onWait callback failures
- persist per-session abort cutoff and skip stale queued messages
- avoid false 600s agentTurn timeout in isolated cron jobs

Fixes #27407
Fixes #27332
Fixes #27427

Co-authored-by: Kevin Shenghui <shenghuikevin@github.com>
Co-authored-by: zjmy <zhangjunmengyang@gmail.com>
Co-authored-by: suko <miha.sukic@gmail.com>
This commit is contained in:
Peter Steinberger
2026-02-26 13:43:30 +01:00
parent 1aef45bc06
commit c397a02c9a
13 changed files with 551 additions and 42 deletions

View File

@@ -9,7 +9,7 @@ import { CronService } from "./service.js";
import { createDeferred, createRunningCronServiceState } from "./service.test-harness.js";
import { computeJobNextRunAtMs } from "./service/jobs.js";
import { createCronServiceState, type CronEvent } from "./service/state.js";
import { executeJobCore, onTimer, runMissedJobs } from "./service/timer.js";
import { DEFAULT_JOB_TIMEOUT_MS, executeJobCore, onTimer, runMissedJobs } from "./service/timer.js";
import type { CronJob, CronJobState } from "./types.js";
const noopLogger = {
@@ -838,6 +838,58 @@ describe("Cron issue regressions", () => {
expect(job?.state.lastStatus).toBe("ok");
});
// Regression test: an isolated `agentTurn` job whose payload sets no explicit
// timeout must still be running after DEFAULT_JOB_TIMEOUT_MS has elapsed, and
// must finish with status "ok" once the underlying run resolves.
it("does not time out agentTurn jobs at the default 10-minute safety window", async () => {
const store = await makeStorePath();
const scheduledAt = Date.parse("2026-02-15T13:00:00.000Z");
// One-shot ("at") job that is due exactly at `scheduledAt`; note the payload
// carries no timeout setting, so the timer's fallback timeout applies.
const cronJob = createIsolatedRegressionJob({
id: "agentturn-default-safety-window",
name: "agentturn default safety window",
scheduledAt,
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
payload: { kind: "agentTurn", message: "work" },
state: { nextRunAtMs: scheduledAt },
});
await writeCronJobs(store.storePath, [cronJob]);
// Mutable clock handed to the service through `nowMs`.
let now = scheduledAt;
// The test decides when the agent run completes by resolving this deferred.
const deferredRun = createDeferred<{ status: "ok"; summary: string }>();
// Stand-in for the agent runner: blocks until the deferred resolves, reports
// an error if the abort signal has fired in the meantime, and otherwise
// advances the clock by 5 ms before returning the deferred's result.
const runIsolatedAgentJob = vi.fn(async ({ abortSignal }: { abortSignal?: AbortSignal }) => {
const result = await deferredRun.promise;
if (abortSignal?.aborted) {
return { status: "error" as const, error: String(abortSignal.reason) };
}
now += 5;
return result;
});
const state = createCronServiceState({
cronEnabled: true,
storePath: store.storePath,
log: noopLogger,
nowMs: () => now,
enqueueSystemEvent: vi.fn(),
requestHeartbeatNow: vi.fn(),
runIsolatedAgentJob,
});
// Start the timer tick without awaiting it so the test can observe whether it
// settles while the agent run is still pending.
const timerPromise = onTimer(state);
let settled = false;
void timerPromise.finally(() => {
settled = true;
});
// Move timers past the default job timeout.
// NOTE(review): this presumably relies on fake timers being installed by suite
// setup that is not visible in this hunk — confirm.
await vi.advanceTimersByTimeAsync(DEFAULT_JOB_TIMEOUT_MS + 1_000);
// Yield once more so an already-scheduled `finally` callback gets a chance to run.
await Promise.resolve();
// The tick must still be pending: the default timeout did not cut the run short.
expect(settled).toBe(false);
// Let the agent run finish and verify the job is recorded as successful.
deferredRun.resolve({ status: "ok", summary: "done" });
await timerPromise;
const job = state.store?.jobs.find((entry) => entry.id === "agentturn-default-safety-window");
expect(job?.state.lastStatus).toBe("ok");
expect(job?.state.lastError).toBeUndefined();
});
it("aborts isolated runs when cron timeout fires", async () => {
vi.useRealTimers();
const store = await makeStorePath();

View File

@@ -36,6 +36,7 @@ const MIN_REFIRE_GAP_MS = 2_000;
* from wedging the entire cron lane.
*/
export const DEFAULT_JOB_TIMEOUT_MS = 10 * 60_000; // 10 minutes
/**
 * Fallback timeout for `agentTurn` jobs that do not configure their own
 * timeout. It is used by `resolveCronJobTimeoutMs` in place of
 * `DEFAULT_JOB_TIMEOUT_MS`, so such jobs get a 60-minute window rather than
 * the 10-minute default.
 */
const AGENT_TURN_SAFETY_TIMEOUT_MS = 60 * 60_000; // 60 minutes
type TimedCronRunOutcome = CronRunOutcome &
CronRunTelemetry & {
@@ -52,7 +53,7 @@ function resolveCronJobTimeoutMs(job: CronJob): number | undefined {
? Math.floor(job.payload.timeoutSeconds * 1_000)
: undefined;
if (configuredTimeoutMs === undefined) {
return DEFAULT_JOB_TIMEOUT_MS;
return job.payload.kind === "agentTurn" ? AGENT_TURN_SAFETY_TIMEOUT_MS : DEFAULT_JOB_TIMEOUT_MS;
}
return configuredTimeoutMs <= 0 ? undefined : configuredTimeoutMs;
}