fix(queue): harden drain/abort/timeout race handling

- reject new lane enqueues once gateway drain begins
- always reset lane draining state and isolate onWait callback failures
- persist per-session abort cutoff and skip stale queued messages
- avoid false 600s agentTurn timeout in isolated cron jobs

Fixes #27407
Fixes #27332
Fixes #27427

Co-authored-by: Kevin Shenghui <shenghuikevin@github.com>
Co-authored-by: zjmy <zhangjunmengyang@gmail.com>
Co-authored-by: suko <miha.sukic@gmail.com>
This commit is contained in:
Peter Steinberger
2026-02-26 13:43:30 +01:00
parent 1aef45bc06
commit c397a02c9a
13 changed files with 551 additions and 42 deletions

View File

@@ -36,6 +36,7 @@ const MIN_REFIRE_GAP_MS = 2_000;
* from wedging the entire cron lane.
*/
export const DEFAULT_JOB_TIMEOUT_MS = 10 * 60_000; // 10 minutes
const AGENT_TURN_SAFETY_TIMEOUT_MS = 60 * 60_000; // 60 minutes
type TimedCronRunOutcome = CronRunOutcome &
CronRunTelemetry & {
@@ -52,7 +53,7 @@ function resolveCronJobTimeoutMs(job: CronJob): number | undefined {
? Math.floor(job.payload.timeoutSeconds * 1_000)
: undefined;
if (configuredTimeoutMs === undefined) {
return DEFAULT_JOB_TIMEOUT_MS;
return job.payload.kind === "agentTurn" ? AGENT_TURN_SAFETY_TIMEOUT_MS : DEFAULT_JOB_TIMEOUT_MS;
}
return configuredTimeoutMs <= 0 ? undefined : configuredTimeoutMs;
}