fix(cron): isolate schedule errors to prevent one bad job from breaking all jobs (#14385)

Previously, if one cron job had a malformed schedule expression (e.g. invalid cron syntax),
the error would propagate up and break the entire scheduler loop. This meant one misconfigured
job could prevent ALL cron jobs from running.

Changes:
- Wrap per-job schedule computation in try/catch in recomputeNextRuns()
- Track consecutive schedule errors via new scheduleErrorCount field
- Log a warning for each schedule error (with job ID, name, and error count), record the message in the job's lastError, and clear its nextRunAtMs
- Auto-disable jobs after 3 consecutive schedule errors (with error-level log)
- Clear error count when schedule computation succeeds
- Continue processing other jobs even when one fails

This ensures the scheduler is resilient to individual job misconfigurations while still
providing visibility into problems through logging.

Co-authored-by: Marvin <numegilagent@gmail.com>
This commit is contained in:
MarvinDontPanic
2026-02-11 23:17:07 -05:00
committed by GitHub
parent ace5e33cee
commit 04f695e562
3 changed files with 223 additions and 3 deletions

View File

@@ -87,6 +87,9 @@ export function computeJobNextRunAtMs(job: CronJob, nowMs: number): number | und
return computeNextRunAtMs(job.schedule, nowMs);
}
/**
 * Maximum consecutive schedule errors before auto-disabling a job.
 *
 * recomputeNextRuns() increments a job's `state.scheduleErrorCount` each time
 * computing its next run throws, and sets `job.enabled = false` once the count
 * reaches this value (the comparison is `>=`). The count is cleared as soon as
 * a schedule computation succeeds, so only uninterrupted failures accumulate.
 */
const MAX_SCHEDULE_ERRORS = 3;
export function recomputeNextRuns(state: CronServiceState): boolean {
if (!state.store) {
return false;
@@ -124,10 +127,36 @@ export function recomputeNextRuns(state: CronServiceState): boolean {
const nextRun = job.state.nextRunAtMs;
const isDueOrMissing = nextRun === undefined || now >= nextRun;
if (isDueOrMissing) {
const newNext = computeJobNextRunAtMs(job, now);
if (job.state.nextRunAtMs !== newNext) {
job.state.nextRunAtMs = newNext;
try {
const newNext = computeJobNextRunAtMs(job, now);
if (job.state.nextRunAtMs !== newNext) {
job.state.nextRunAtMs = newNext;
changed = true;
}
// Clear schedule error count on successful computation.
if (job.state.scheduleErrorCount) {
job.state.scheduleErrorCount = undefined;
changed = true;
}
} catch (err) {
const errorCount = (job.state.scheduleErrorCount ?? 0) + 1;
job.state.scheduleErrorCount = errorCount;
job.state.nextRunAtMs = undefined;
job.state.lastError = `schedule error: ${String(err)}`;
changed = true;
if (errorCount >= MAX_SCHEDULE_ERRORS) {
job.enabled = false;
state.deps.log.error(
{ jobId: job.id, name: job.name, errorCount, err: String(err) },
"cron: auto-disabled job after repeated schedule errors",
);
} else {
state.deps.log.warn(
{ jobId: job.id, name: job.name, errorCount, err: String(err) },
"cron: failed to compute next run for job (skipping)",
);
}
}
}
}