feat(cron): configurable failure alerts for repeated job errors (openclaw#24789) thanks @0xbrak

Verified:
- pnpm install --frozen-lockfile
- pnpm check
- pnpm test -- --run src/cron/service.failure-alert.test.ts src/cli/cron-cli.test.ts src/gateway/protocol/cron-validators.test.ts

Co-authored-by: 0xbrak <181251288+0xbrak@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
0xbrak
2026-03-01 09:18:15 -05:00
committed by GitHub
parent f902697bd5
commit 4637b90c07
18 changed files with 842 additions and 1 deletions

View File

@@ -9,6 +9,7 @@ import {
import type {
CronDelivery,
CronDeliveryPatch,
CronFailureAlert,
CronJob,
CronJobCreate,
CronJobPatch,
@@ -419,6 +420,7 @@ export function createJob(state: CronServiceState, input: CronJobCreate): CronJo
wakeMode: input.wakeMode,
payload: input.payload,
delivery: input.delivery,
failureAlert: input.failureAlert,
state: {
...input.state,
},
@@ -483,6 +485,9 @@ export function applyJobPatch(job: CronJob, patch: CronJobPatch) {
if (patch.delivery) {
job.delivery = mergeCronDelivery(job.delivery, patch.delivery);
}
if ("failureAlert" in patch) {
job.failureAlert = mergeCronFailureAlert(job.failureAlert, patch.failureAlert);
}
if (job.sessionTarget === "main" && job.delivery?.mode !== "webhook") {
job.delivery = undefined;
}
@@ -648,6 +653,42 @@ function mergeCronDelivery(
return next;
}
/**
 * Merge a failure-alert patch into a job's current failure-alert setting.
 *
 * - `patch === false` replaces the setting with `false`.
 * - `patch === undefined` leaves the existing setting untouched.
 * - Otherwise the patch is applied field by field: only keys that are present
 *   in `patch` are written, and a present key whose value is invalid (wrong
 *   type, non-finite, out of range, or blank after trimming) clears that field
 *   to `undefined`.
 *
 * @param existing - The job's current setting (`false`, `undefined`, or an object).
 * @param patch - The requested change.
 * @returns The setting to store on the job. A new object is returned whenever
 *   `patch` is an object; `existing` is never mutated.
 */
function mergeCronFailureAlert(
  existing: CronFailureAlert | false | undefined,
  patch: CronFailureAlert | false | undefined,
): CronFailureAlert | false | undefined {
  if (patch === false) {
    return false;
  }
  if (patch === undefined) {
    return existing;
  }
  // Start from a shallow copy so the caller's object is not modified.
  const base = existing === false || existing === undefined ? {} : existing;
  const next: CronFailureAlert = { ...base };
  if ("after" in patch) {
    // Floor BEFORE the range check: a fractional value below 1 (e.g. 0.5) must
    // clear the field rather than be persisted as a meaningless threshold of 0.
    const after =
      typeof patch.after === "number" && Number.isFinite(patch.after)
        ? Math.floor(patch.after)
        : 0;
    next.after = after >= 1 ? after : undefined;
  }
  if ("channel" in patch) {
    const channel = typeof patch.channel === "string" ? patch.channel.trim() : "";
    next.channel = channel ? channel : undefined;
  }
  if ("to" in patch) {
    const to = typeof patch.to === "string" ? patch.to.trim() : "";
    next.to = to ? to : undefined;
  }
  if ("cooldownMs" in patch) {
    // Zero is a valid cooldown; only negative or non-numeric values clear it.
    const cooldownMs =
      typeof patch.cooldownMs === "number" && Number.isFinite(patch.cooldownMs)
        ? Math.floor(patch.cooldownMs)
        : -1;
    next.cooldownMs = cooldownMs >= 0 ? cooldownMs : undefined;
  }
  return next;
}
export function isJobDue(job: CronJob, nowMs: number, opts: { forced: boolean }) {
if (!job.state) {
job.state = {};

View File

@@ -5,6 +5,7 @@ import type {
CronJob,
CronJobCreate,
CronJobPatch,
CronMessageChannel,
CronRunOutcome,
CronRunStatus,
CronRunTelemetry,
@@ -90,6 +91,12 @@ export type CronServiceDeps = {
} & CronRunOutcome &
CronRunTelemetry
>;
sendCronFailureAlert?: (params: {
job: CronJob;
text: string;
channel: CronMessageChannel;
to?: string;
}) => Promise<void>;
onEvent?: (evt: CronEvent) => void;
};

View File

@@ -6,6 +6,7 @@ import { sweepCronRunSessions } from "../session-reaper.js";
import type {
CronDeliveryStatus,
CronJob,
CronMessageChannel,
CronRunOutcome,
CronRunStatus,
CronRunTelemetry,
@@ -33,6 +34,8 @@ const MAX_TIMER_DELAY_MS = 60_000;
* but always breaks an infinite re-trigger cycle. (See #17821)
*/
const MIN_REFIRE_GAP_MS = 2_000;
// Fallback consecutive-error threshold for failure alerts, used when neither
// the job nor the global cron config provides a usable `after` value.
const DEFAULT_FAILURE_ALERT_AFTER = 2;
// Fallback minimum gap between two failure alerts for the same job, used when
// neither the job nor the global cron config provides a usable `cooldownMs`.
const DEFAULT_FAILURE_ALERT_COOLDOWN_MS = 60 * 60_000; // 1 hour
type TimedCronRunOutcome = CronRunOutcome &
CronRunTelemetry & {
@@ -149,6 +152,106 @@ function resolveDeliveryStatus(params: { job: CronJob; delivered?: boolean }): C
return resolveCronDeliveryPlan(params.job).requested ? "unknown" : "not-requested";
}
/**
 * Coerce an arbitrary value into a channel identifier.
 *
 * @param input - Value to normalize; anything other than a string is rejected.
 * @returns The trimmed, lower-cased string, or `undefined` when `input` is not
 *   a string or is empty after trimming. The result is not checked against the
 *   set of known channels.
 */
function normalizeCronMessageChannel(input: unknown): CronMessageChannel | undefined {
  const normalized = typeof input === "string" ? input.trim().toLowerCase() : "";
  if (normalized.length === 0) {
    return undefined;
  }
  return normalized as CronMessageChannel;
}
/**
 * Coerce an arbitrary value into a delivery target string.
 *
 * @param input - Value to normalize; anything other than a string is rejected.
 * @returns The trimmed string (case preserved), or `undefined` when `input` is
 *   not a string or is empty after trimming.
 */
function normalizeTo(input: unknown): string | undefined {
  const trimmed = typeof input === "string" ? input.trim() : "";
  if (trimmed.length === 0) {
    return undefined;
  }
  return trimmed;
}
/**
 * Round a value down to an integer and accept it only if it is at least 1.
 *
 * @param value - Candidate value; must be a finite number to be considered.
 * @param fallback - Returned as-is when `value` is not a finite number or
 *   floors to less than 1.
 * @returns `Math.floor(value)` when that is `>= 1`, otherwise `fallback`.
 */
function clampPositiveInt(value: unknown, fallback: number): number {
  // NaN marks "unusable input"; every comparison with NaN is false, so it
  // falls through to the fallback below.
  const whole = typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : NaN;
  return whole >= 1 ? whole : fallback;
}
/**
 * Round a value down to an integer and accept it only if it is zero or greater.
 *
 * @param value - Candidate value; must be a finite number to be considered.
 * @param fallback - Returned as-is when `value` is not a finite number or
 *   floors to a negative integer.
 * @returns `Math.floor(value)` when that is `>= 0`, otherwise `fallback`.
 */
function clampNonNegativeInt(value: unknown, fallback: number): number {
  // NaN marks "unusable input"; every comparison with NaN is false, so it
  // falls through to the fallback below.
  const whole = typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : NaN;
  return whole >= 0 ? whole : fallback;
}
/**
 * Work out the effective failure-alert settings for a job.
 *
 * Alerts are off (returns `null`) when the job sets `failureAlert` to `false`,
 * or when the job has no `failureAlert` object and the global cron config does
 * not have `failureAlert.enabled === true`.
 *
 * Otherwise each numeric field takes the job's value, then the global value,
 * then the built-in default. `channel` takes the job's alert channel, then the
 * job's delivery channel, then `"last"`; `to` takes the job's alert target,
 * then the job's delivery target.
 *
 * @param state - Service state; only `state.deps.cronConfig` is read.
 * @param job - The job whose alert settings are being resolved.
 * @returns The resolved settings, or `null` when alerts are off for this job.
 */
function resolveFailureAlert(
  state: CronServiceState,
  job: CronJob,
): { after: number; cooldownMs: number; channel: CronMessageChannel; to?: string } | null {
  const globalSettings = state.deps.cronConfig?.failureAlert;
  const perJob = job.failureAlert;
  // An explicit `false` on the job wins over everything, including the global switch.
  if (perJob === false) {
    return null;
  }
  const globallyEnabled = globalSettings?.enabled === true;
  if (!perJob && !globallyEnabled) {
    return null;
  }

  const after = clampPositiveInt(
    perJob?.after ?? globalSettings?.after,
    DEFAULT_FAILURE_ALERT_AFTER,
  );
  const cooldownMs = clampNonNegativeInt(
    perJob?.cooldownMs ?? globalSettings?.cooldownMs,
    DEFAULT_FAILURE_ALERT_COOLDOWN_MS,
  );
  const channel: CronMessageChannel =
    normalizeCronMessageChannel(perJob?.channel) ??
    normalizeCronMessageChannel(job.delivery?.channel) ??
    "last";
  const to = normalizeTo(perJob?.to) ?? normalizeTo(job.delivery?.to);

  return { after, cooldownMs, channel, to };
}
/**
 * Build the failure-alert message for a job and hand it off for delivery.
 *
 * The message names the job (its `name`, or its `id` when the name is empty),
 * the consecutive-error count, and the last error trimmed and cut to 200
 * characters (`"unknown error"` when absent or blank).
 *
 * When `state.deps.sendCronFailureAlert` is provided, the message is sent
 * through it without awaiting; a rejected promise is logged as a warning.
 * Otherwise the message is enqueued as a system event for the job's agent and,
 * if the job's `wakeMode` is `"now"`, an immediate heartbeat is requested.
 *
 * @param state - Service state supplying the delivery, logging and event deps.
 * @param params - The failing job, its last error, the consecutive-error count
 *   and the resolved alert destination.
 */
function emitFailureAlert(
  state: CronServiceState,
  params: {
    job: CronJob;
    error?: string;
    consecutiveErrors: number;
    channel: CronMessageChannel;
    to?: string;
  },
) {
  const { job, channel, to } = params;
  const label = job.name || job.id;
  const reason = (params.error?.trim() || "unknown error").slice(0, 200);
  const headline = `Cron job "${label}" failed ${params.consecutiveErrors} times`;
  const detail = `Last error: ${reason}`;
  const text = `${headline}\n${detail}`;

  // No dedicated sender configured: fall back to a system event.
  if (!state.deps.sendCronFailureAlert) {
    state.deps.enqueueSystemEvent(text, { agentId: job.agentId });
    if (job.wakeMode === "now") {
      state.deps.requestHeartbeatNow({ reason: `cron:${job.id}:failure-alert` });
    }
    return;
  }

  // Fire and forget: a delivery failure is logged but does not reach the caller.
  const delivery = state.deps.sendCronFailureAlert({ job, text, channel, to });
  void delivery.catch((err) => {
    state.deps.log.warn(
      { jobId: job.id, err: String(err) },
      "cron: failure alert delivery failed",
    );
  });
}
/**
* Apply the result of a job execution to the job's state.
* Handles consecutive error tracking, exponential backoff, one-shot disable,
@@ -181,8 +284,26 @@ export function applyJobResult(
// Track consecutive errors for backoff / auto-disable.
if (result.status === "error") {
job.state.consecutiveErrors = (job.state.consecutiveErrors ?? 0) + 1;
const alertConfig = resolveFailureAlert(state, job);
if (alertConfig && job.state.consecutiveErrors >= alertConfig.after) {
const now = state.deps.nowMs();
const lastAlert = job.state.lastFailureAlertAtMs;
const inCooldown =
typeof lastAlert === "number" && now - lastAlert < Math.max(0, alertConfig.cooldownMs);
if (!inCooldown) {
emitFailureAlert(state, {
job,
error: result.error,
consecutiveErrors: job.state.consecutiveErrors,
channel: alertConfig.channel,
to: alertConfig.to,
});
job.state.lastFailureAlertAtMs = now;
}
}
} else {
job.state.consecutiveErrors = 0;
job.state.lastFailureAlertAtMs = undefined;
}
const shouldDelete =