fix(cron): share isolated announce flow + harden cron scheduling/delivery (#11641)

* fix(cron): comprehensive cron scheduling and delivery fixes

- Fix delivery target resolution for isolated agent cron jobs
- Improve schedule parsing and validation
- Add job retry logic and error handling
- Enhance cron ops with better state management
- Add timer improvements for more reliable cron execution
- Add cron event type to protocol schema
- Support cron events in heartbeat runner (skip empty-heartbeat check,
  use dedicated CRON_EVENT_PROMPT for relay)

* fix: remove cron debug test and add changelog/docs notes (#11641) (thanks @tyler6204)
This commit is contained in:
Tyler Yust
2026-02-07 19:46:01 -08:00
committed by GitHub
parent ebe5730401
commit 8fae55e8e0
19 changed files with 488 additions and 150 deletions

View File

@@ -118,10 +118,17 @@ export function recomputeNextRuns(state: CronServiceState): boolean {
job.state.runningAtMs = undefined;
changed = true;
}
const newNext = computeJobNextRunAtMs(job, now);
if (job.state.nextRunAtMs !== newNext) {
job.state.nextRunAtMs = newNext;
changed = true;
// Only recompute if nextRunAtMs is missing or already past-due.
// Preserving a still-future nextRunAtMs avoids accidentally advancing
// a job that hasn't fired yet (e.g. during restart recovery).
const nextRun = job.state.nextRunAtMs;
const isDueOrMissing = nextRun === undefined || now >= nextRun;
if (isDueOrMissing) {
const newNext = computeJobNextRunAtMs(job, now);
if (job.state.nextRunAtMs !== newNext) {
job.state.nextRunAtMs = newNext;
changed = true;
}
}
}
return changed;
@@ -380,6 +387,9 @@ function mergeCronDelivery(
}
export function isJobDue(job: CronJob, nowMs: number, opts: { forced: boolean }) {
if (!job.state) {
job.state = {};
}
if (typeof job.state.runningAtMs === "number") {
return false;
}

View File

@@ -52,6 +52,12 @@ export function stop(state: CronServiceState) {
export async function status(state: CronServiceState) {
return await locked(state, async () => {
await ensureLoaded(state, { skipRecompute: true });
if (state.store) {
const changed = recomputeNextRuns(state);
if (changed) {
await persist(state);
}
}
return {
enabled: state.deps.cronEnabled,
storePath: state.deps.storePath,
@@ -64,6 +70,12 @@ export async function status(state: CronServiceState) {
export async function list(state: CronServiceState, opts?: { includeDisabled?: boolean }) {
return await locked(state, async () => {
await ensureLoaded(state, { skipRecompute: true });
if (state.store) {
const changed = recomputeNextRuns(state);
if (changed) {
await persist(state);
}
}
const includeDisabled = opts?.includeDisabled === true;
const jobs = (state.store?.jobs ?? []).filter((j) => includeDisabled || j.enabled);
return jobs.toSorted((a, b) => (a.state.nextRunAtMs ?? 0) - (b.state.nextRunAtMs ?? 0));
@@ -76,8 +88,25 @@ export async function add(state: CronServiceState, input: CronJobCreate) {
await ensureLoaded(state);
const job = createJob(state, input);
state.store?.jobs.push(job);
// Defensive: recompute all next-run times to ensure consistency
recomputeNextRuns(state);
await persist(state);
armTimer(state);
state.deps.log.info(
{
jobId: job.id,
jobName: job.name,
nextRunAtMs: job.state.nextRunAtMs,
schedulerNextWakeAtMs: nextWakeAtMs(state) ?? null,
timerArmed: state.timer !== null,
cronEnabled: state.deps.cronEnabled,
},
"cron: job added",
);
emit(state, {
jobId: job.id,
action: "added",
@@ -110,12 +139,17 @@ export async function update(state: CronServiceState, id: string, patch: CronJob
};
}
}
const scheduleChanged = patch.schedule !== undefined;
const enabledChanged = patch.enabled !== undefined;
job.updatedAtMs = now;
if (job.enabled) {
job.state.nextRunAtMs = computeJobNextRunAtMs(job, now);
} else {
job.state.nextRunAtMs = undefined;
job.state.runningAtMs = undefined;
if (scheduleChanged || enabledChanged) {
if (job.enabled) {
job.state.nextRunAtMs = computeJobNextRunAtMs(job, now);
} else {
job.state.nextRunAtMs = undefined;
job.state.runningAtMs = undefined;
}
}
await persist(state);

View File

@@ -13,19 +13,131 @@ import { ensureLoaded, persist } from "./store.js";
/**
 * Upper bound (in ms) on the delay passed to the scheduler's wake-up timer.
 * `armTimer` clamps its computed delay to this value so the scheduler wakes
 * at least once a minute, even when the next job is further away.
 */
const MAX_TIMER_DELAY_MS = 60_000;
/**
 * Maximum wall-clock time for a single job execution. Acts as a safety net
 * on top of the per-provider / per-agent timeouts to prevent one stuck job
 * from wedging the entire cron lane.
 *
 * Used as the fallback when an `agentTurn` payload does not carry a numeric
 * `timeoutSeconds` (and for every other payload kind).
 */
const DEFAULT_JOB_TIMEOUT_MS = 10 * 60_000; // 10 minutes
/**
 * Exponential backoff delays (in ms) indexed by consecutive error count.
 * After the last entry the delay stays constant.
 */
const ERROR_BACKOFF_SCHEDULE_MS = [
  30_000, // 1st error → 30 s
  60_000, // 2nd error → 1 min
  5 * 60_000, // 3rd error → 5 min
  15 * 60_000, // 4th error → 15 min
  60 * 60_000, // 5th+ error → 60 min
];

/**
 * Look up the backoff delay for a job that has failed `consecutiveErrors`
 * times in a row.
 *
 * @param consecutiveErrors - Number of consecutive failures (1-based). Values
 *   below 1 and NaN are treated as the first error; fractional values are
 *   rounded down; values beyond the schedule use its last entry.
 * @returns The delay in milliseconds; always a finite entry of
 *   `ERROR_BACKOFF_SCHEDULE_MS`.
 */
function errorBackoffMs(consecutiveErrors: number): number {
  const lastIdx = ERROR_BACKOFF_SCHEDULE_MS.length - 1;
  // The count may originate from persisted state, so normalise it before it
  // is used as an array index: a NaN or fractional index would read
  // `undefined` and poison the caller's `endedAt + backoff` arithmetic.
  const count = Number.isNaN(consecutiveErrors) ? 1 : Math.floor(consecutiveErrors);
  const idx = Math.max(0, Math.min(count - 1, lastIdx));
  return ERROR_BACKOFF_SCHEDULE_MS[idx];
}
/**
 * Record the outcome of one job run on the job itself and work out when (or
 * whether) it should run again.
 *
 * Updates the run bookkeeping on `job.state` (last run, status, duration,
 * error, consecutive error count) and `job.updatedAtMs`, then picks the next
 * run time:
 * - one-shot (`at`) jobs are disabled after any terminal status;
 * - failed recurring jobs are pushed out by at least the error backoff;
 * - other enabled jobs get their regular next run; disabled jobs get none.
 *
 * @param state - Cron service state; only its logger is used here.
 * @param job - The job that just ran; mutated in place.
 * @param result - Terminal status, optional error text, and run timestamps.
 * @returns `true` when the caller should remove the job (a successful
 *   one-shot job with `deleteAfterRun` set); `false` otherwise.
 */
function applyJobResult(
  state: CronServiceState,
  job: CronJob,
  result: {
    status: "ok" | "error" | "skipped";
    error?: string;
    startedAt: number;
    endedAt: number;
  },
): boolean {
  const { status, error, startedAt, endedAt } = result;
  const jobState = job.state;

  // Run bookkeeping.
  jobState.runningAtMs = undefined;
  jobState.lastRunAtMs = startedAt;
  jobState.lastStatus = status;
  jobState.lastDurationMs = Math.max(0, endedAt - startedAt);
  jobState.lastError = error;
  job.updatedAtMs = endedAt;

  // Any non-error outcome resets the consecutive error streak.
  const failed = status === "error";
  jobState.consecutiveErrors = failed ? (jobState.consecutiveErrors ?? 0) + 1 : 0;

  const isOneShot = job.schedule.kind === "at";

  // A successful one-shot job flagged for deletion: leave scheduling state
  // untouched and let the caller remove it.
  if (isOneShot && status === "ok" && job.deleteAfterRun === true) {
    return true;
  }

  if (isOneShot) {
    // One-shot jobs never re-arm, whatever the terminal status (ok, error,
    // or skipped). Otherwise computeJobNextRunAtMs could hand back the past
    // atMs value and cause tight-loop rescheduling (#11452).
    job.enabled = false;
    jobState.nextRunAtMs = undefined;
    if (failed) {
      state.deps.log.warn(
        {
          jobId: job.id,
          jobName: job.name,
          consecutiveErrors: jobState.consecutiveErrors,
          error,
        },
        "cron: disabling one-shot job after error",
      );
    }
    return false;
  }

  if (!job.enabled) {
    jobState.nextRunAtMs = undefined;
    return false;
  }

  if (!failed) {
    jobState.nextRunAtMs = computeJobNextRunAtMs(job, endedAt);
    return false;
  }

  // Failed recurring job: delay the retry to prevent retry storms, taking
  // whichever is later of the natural next run and the backoff deadline.
  const backoffMs = errorBackoffMs(jobState.consecutiveErrors ?? 1);
  const scheduledNext = computeJobNextRunAtMs(job, endedAt);
  const earliestRetry = endedAt + backoffMs;
  jobState.nextRunAtMs =
    scheduledNext === undefined ? earliestRetry : Math.max(scheduledNext, earliestRetry);
  state.deps.log.info(
    {
      jobId: job.id,
      consecutiveErrors: jobState.consecutiveErrors,
      backoffMs,
      nextRunAtMs: jobState.nextRunAtMs,
    },
    "cron: applying error backoff",
  );
  return false;
}
export function armTimer(state: CronServiceState) {
if (state.timer) {
clearTimeout(state.timer);
}
state.timer = null;
if (!state.deps.cronEnabled) {
state.deps.log.debug({}, "cron: armTimer skipped - scheduler disabled");
return;
}
const nextAt = nextWakeAtMs(state);
if (!nextAt) {
const jobCount = state.store?.jobs.length ?? 0;
const enabledCount = state.store?.jobs.filter((j) => j.enabled).length ?? 0;
const withNextRun =
state.store?.jobs.filter((j) => j.enabled && typeof j.state.nextRunAtMs === "number")
.length ?? 0;
state.deps.log.debug(
{ jobCount, enabledCount, withNextRun },
"cron: armTimer skipped - no jobs with nextRunAtMs",
);
return;
}
const delay = Math.max(nextAt - state.deps.nowMs(), 0);
const now = state.deps.nowMs();
const delay = Math.max(nextAt - now, 0);
// Wake at least once a minute to avoid schedule drift and recover quickly
// when the process was paused or wall-clock time jumps.
const clampedDelay = Math.min(delay, MAX_TIMER_DELAY_MS);
@@ -36,6 +148,10 @@ export function armTimer(state: CronServiceState) {
state.deps.log.error({ err: String(err) }, "cron: timer tick failed");
}
}, clampedDelay);
state.deps.log.debug(
{ nextAt, delayMs: clampedDelay, clamped: delay > MAX_TIMER_DELAY_MS },
"cron: timer armed",
);
}
export async function onTimer(state: CronServiceState) {
@@ -84,10 +200,29 @@ export async function onTimer(state: CronServiceState) {
const startedAt = state.deps.nowMs();
job.state.runningAtMs = startedAt;
emit(state, { jobId: job.id, action: "started", runAtMs: startedAt });
const jobTimeoutMs =
job.payload.kind === "agentTurn" && typeof job.payload.timeoutSeconds === "number"
? job.payload.timeoutSeconds * 1_000
: DEFAULT_JOB_TIMEOUT_MS;
try {
const result = await executeJobCore(state, job);
let timeoutId: NodeJS.Timeout;
const result = await Promise.race([
executeJobCore(state, job),
new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() => reject(new Error("cron: job execution timed out")),
jobTimeoutMs,
);
}),
]).finally(() => clearTimeout(timeoutId!));
results.push({ jobId: id, ...result, startedAt, endedAt: state.deps.nowMs() });
} catch (err) {
state.deps.log.warn(
{ jobId: id, jobName: job.name, timeoutMs: jobTimeoutMs },
`cron: job failed: ${String(err)}`,
);
results.push({
jobId: id,
status: "error",
@@ -108,26 +243,12 @@ export async function onTimer(state: CronServiceState) {
continue;
}
const startedAt = result.startedAt;
job.state.runningAtMs = undefined;
job.state.lastRunAtMs = startedAt;
job.state.lastStatus = result.status;
job.state.lastDurationMs = Math.max(0, result.endedAt - startedAt);
job.state.lastError = result.error;
const shouldDelete =
job.schedule.kind === "at" && result.status === "ok" && job.deleteAfterRun === true;
if (!shouldDelete) {
if (job.schedule.kind === "at" && result.status === "ok") {
job.enabled = false;
job.state.nextRunAtMs = undefined;
} else if (job.enabled) {
job.state.nextRunAtMs = computeJobNextRunAtMs(job, result.endedAt);
} else {
job.state.nextRunAtMs = undefined;
}
}
const shouldDelete = applyJobResult(state, job, {
status: result.status,
error: result.error,
startedAt: result.startedAt,
endedAt: result.endedAt,
});
emit(state, {
jobId: job.id,
@@ -137,7 +258,7 @@ export async function onTimer(state: CronServiceState) {
summary: result.summary,
sessionId: result.sessionId,
sessionKey: result.sessionKey,
runAtMs: startedAt,
runAtMs: result.startedAt,
durationMs: job.state.lastDurationMs,
nextRunAtMs: job.state.nextRunAtMs,
});
@@ -146,8 +267,6 @@ export async function onTimer(state: CronServiceState) {
state.store.jobs = state.store.jobs.filter((j) => j.id !== job.id);
emit(state, { jobId: job.id, action: "removed" });
}
job.updatedAtMs = result.endedAt;
}
recomputeNextRuns(state);
@@ -166,6 +285,9 @@ function findDueJobs(state: CronServiceState): CronJob[] {
}
const now = state.deps.nowMs();
return state.store.jobs.filter((j) => {
if (!j.state) {
j.state = {};
}
if (!j.enabled) {
return false;
}
@@ -183,6 +305,9 @@ export async function runMissedJobs(state: CronServiceState) {
}
const now = state.deps.nowMs();
const missed = state.store.jobs.filter((j) => {
if (!j.state) {
j.state = {};
}
if (!j.enabled) {
return false;
}
@@ -213,6 +338,9 @@ export async function runDueJobs(state: CronServiceState) {
}
const now = state.deps.nowMs();
const due = state.store.jobs.filter((j) => {
if (!j.state) {
j.state = {};
}
if (!j.enabled) {
return false;
}
@@ -323,76 +451,54 @@ async function executeJobCore(
export async function executeJob(
state: CronServiceState,
job: CronJob,
nowMs: number,
opts: { forced: boolean },
_nowMs: number,
_opts: { forced: boolean },
) {
if (!job.state) {
job.state = {};
}
const startedAt = state.deps.nowMs();
job.state.runningAtMs = startedAt;
job.state.lastError = undefined;
emit(state, { jobId: job.id, action: "started", runAtMs: startedAt });
let deleted = false;
const finish = async (
status: "ok" | "error" | "skipped",
err?: string,
summary?: string,
session?: { sessionId?: string; sessionKey?: string },
) => {
const endedAt = state.deps.nowMs();
job.state.runningAtMs = undefined;
job.state.lastRunAtMs = startedAt;
job.state.lastStatus = status;
job.state.lastDurationMs = Math.max(0, endedAt - startedAt);
job.state.lastError = err;
const shouldDelete =
job.schedule.kind === "at" && status === "ok" && job.deleteAfterRun === true;
if (!shouldDelete) {
if (job.schedule.kind === "at" && status === "ok") {
job.enabled = false;
job.state.nextRunAtMs = undefined;
} else if (job.enabled) {
job.state.nextRunAtMs = computeJobNextRunAtMs(job, endedAt);
} else {
job.state.nextRunAtMs = undefined;
}
}
emit(state, {
jobId: job.id,
action: "finished",
status,
error: err,
summary,
sessionId: session?.sessionId,
sessionKey: session?.sessionKey,
runAtMs: startedAt,
durationMs: job.state.lastDurationMs,
nextRunAtMs: job.state.nextRunAtMs,
});
if (shouldDelete && state.store) {
state.store.jobs = state.store.jobs.filter((j) => j.id !== job.id);
deleted = true;
emit(state, { jobId: job.id, action: "removed" });
}
let coreResult: {
status: "ok" | "error" | "skipped";
error?: string;
summary?: string;
sessionId?: string;
sessionKey?: string;
};
try {
const result = await executeJobCore(state, job);
await finish(result.status, result.error, result.summary, {
sessionId: result.sessionId,
sessionKey: result.sessionKey,
});
coreResult = await executeJobCore(state, job);
} catch (err) {
await finish("error", String(err));
} finally {
job.updatedAtMs = nowMs;
if (!opts.forced && job.enabled && !deleted) {
job.state.nextRunAtMs = computeJobNextRunAtMs(job, state.deps.nowMs());
}
coreResult = { status: "error", error: String(err) };
}
const endedAt = state.deps.nowMs();
const shouldDelete = applyJobResult(state, job, {
status: coreResult.status,
error: coreResult.error,
startedAt,
endedAt,
});
emit(state, {
jobId: job.id,
action: "finished",
status: coreResult.status,
error: coreResult.error,
summary: coreResult.summary,
sessionId: coreResult.sessionId,
sessionKey: coreResult.sessionKey,
runAtMs: startedAt,
durationMs: job.state.lastDurationMs,
nextRunAtMs: job.state.nextRunAtMs,
});
if (shouldDelete && state.store) {
state.store.jobs = state.store.jobs.filter((j) => j.id !== job.id);
emit(state, { jobId: job.id, action: "removed" });
}
}