mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-13 17:46:37 +00:00
feat(cron): configurable failure alerts for repeated job errors (openclaw#24789) thanks @0xbrak
Verified:
- pnpm install --frozen-lockfile
- pnpm check
- pnpm test -- --run src/cron/service.failure-alert.test.ts src/cli/cron-cli.test.ts src/gateway/protocol/cron-validators.test.ts

Co-authored-by: 0xbrak <181251288+0xbrak@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
||||
import type {
|
||||
CronDelivery,
|
||||
CronDeliveryPatch,
|
||||
CronFailureAlert,
|
||||
CronJob,
|
||||
CronJobCreate,
|
||||
CronJobPatch,
|
||||
@@ -419,6 +420,7 @@ export function createJob(state: CronServiceState, input: CronJobCreate): CronJo
|
||||
wakeMode: input.wakeMode,
|
||||
payload: input.payload,
|
||||
delivery: input.delivery,
|
||||
failureAlert: input.failureAlert,
|
||||
state: {
|
||||
...input.state,
|
||||
},
|
||||
@@ -483,6 +485,9 @@ export function applyJobPatch(job: CronJob, patch: CronJobPatch) {
|
||||
if (patch.delivery) {
|
||||
job.delivery = mergeCronDelivery(job.delivery, patch.delivery);
|
||||
}
|
||||
if ("failureAlert" in patch) {
|
||||
job.failureAlert = mergeCronFailureAlert(job.failureAlert, patch.failureAlert);
|
||||
}
|
||||
if (job.sessionTarget === "main" && job.delivery?.mode !== "webhook") {
|
||||
job.delivery = undefined;
|
||||
}
|
||||
@@ -648,6 +653,42 @@ function mergeCronDelivery(
|
||||
return next;
|
||||
}
|
||||
|
||||
/**
 * Merge a failure-alert patch into a job's current failure-alert setting.
 *
 * - `patch === false` turns alerts off for the job (returns `false`).
 * - `patch === undefined` leaves the current setting untouched.
 * - Otherwise a copy of the current object (or an empty object when the
 *   current value is `false`/`undefined`) is updated with only the keys that
 *   are present on `patch`. A present key with an unusable value clears that
 *   field by setting it to `undefined`.
 *
 * @param existing The job's current failure-alert setting.
 * @param patch The requested change.
 * @returns The merged setting; `existing` is never mutated.
 */
function mergeCronFailureAlert(
  existing: CronFailureAlert | false | undefined,
  patch: CronFailureAlert | false | undefined,
): CronFailureAlert | false | undefined {
  if (patch === false) {
    return false;
  }
  if (patch === undefined) {
    return existing;
  }

  const base = existing === false || existing === undefined ? {} : existing;
  const next: CronFailureAlert = { ...base };

  if ("after" in patch) {
    // Floor BEFORE validating so fractional values below 1 (e.g. 0.5) are
    // cleared instead of being stored as a meaningless threshold of 0.
    const after =
      typeof patch.after === "number" && Number.isFinite(patch.after)
        ? Math.floor(patch.after)
        : 0;
    next.after = after >= 1 ? after : undefined;
  }
  if ("channel" in patch) {
    const channel = typeof patch.channel === "string" ? patch.channel.trim() : "";
    next.channel = channel ? channel : undefined;
  }
  if ("to" in patch) {
    const to = typeof patch.to === "string" ? patch.to.trim() : "";
    next.to = to ? to : undefined;
  }
  if ("cooldownMs" in patch) {
    // Zero is a valid cooldown; only negative / non-finite values are cleared.
    const cooldownMs =
      typeof patch.cooldownMs === "number" && Number.isFinite(patch.cooldownMs)
        ? patch.cooldownMs
        : -1;
    next.cooldownMs = cooldownMs >= 0 ? Math.floor(cooldownMs) : undefined;
  }

  return next;
}
|
||||
|
||||
export function isJobDue(job: CronJob, nowMs: number, opts: { forced: boolean }) {
|
||||
if (!job.state) {
|
||||
job.state = {};
|
||||
|
||||
@@ -5,6 +5,7 @@ import type {
|
||||
CronJob,
|
||||
CronJobCreate,
|
||||
CronJobPatch,
|
||||
CronMessageChannel,
|
||||
CronRunOutcome,
|
||||
CronRunStatus,
|
||||
CronRunTelemetry,
|
||||
@@ -90,6 +91,12 @@ export type CronServiceDeps = {
|
||||
} & CronRunOutcome &
|
||||
CronRunTelemetry
|
||||
>;
|
||||
sendCronFailureAlert?: (params: {
|
||||
job: CronJob;
|
||||
text: string;
|
||||
channel: CronMessageChannel;
|
||||
to?: string;
|
||||
}) => Promise<void>;
|
||||
onEvent?: (evt: CronEvent) => void;
|
||||
};
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import { sweepCronRunSessions } from "../session-reaper.js";
|
||||
import type {
|
||||
CronDeliveryStatus,
|
||||
CronJob,
|
||||
CronMessageChannel,
|
||||
CronRunOutcome,
|
||||
CronRunStatus,
|
||||
CronRunTelemetry,
|
||||
@@ -33,6 +34,8 @@ const MAX_TIMER_DELAY_MS = 60_000;
|
||||
* but always breaks an infinite re-trigger cycle. (See #17821)
|
||||
*/
|
||||
const MIN_REFIRE_GAP_MS = 2_000;
|
||||
/** Consecutive-error count that triggers a failure alert when neither the job nor the global cron config supplies a usable `after` value. */
const DEFAULT_FAILURE_ALERT_AFTER = 2;
|
||||
/** Minimum gap between two failure alerts for the same job when no usable `cooldownMs` is configured. */
const DEFAULT_FAILURE_ALERT_COOLDOWN_MS = 60 * 60_000; // 1 hour
|
||||
|
||||
type TimedCronRunOutcome = CronRunOutcome &
|
||||
CronRunTelemetry & {
|
||||
@@ -149,6 +152,106 @@ function resolveDeliveryStatus(params: { job: CronJob; delivered?: boolean }): C
|
||||
return resolveCronDeliveryPlan(params.job).requested ? "unknown" : "not-requested";
|
||||
}
|
||||
|
||||
/**
 * Coerce an arbitrary value into a channel identifier.
 *
 * Strings are trimmed and lower-cased; anything that is not a string, or that
 * is empty after trimming, yields `undefined`.
 */
function normalizeCronMessageChannel(input: unknown): CronMessageChannel | undefined {
  const normalized = typeof input === "string" ? input.trim().toLowerCase() : "";
  if (normalized.length === 0) {
    return undefined;
  }
  return normalized as CronMessageChannel;
}
|
||||
|
||||
/**
 * Coerce an arbitrary value into a delivery target.
 *
 * Returns the trimmed string, or `undefined` when the input is not a string
 * or contains only whitespace.
 */
function normalizeTo(input: unknown): string | undefined {
  const trimmed = typeof input === "string" ? input.trim() : "";
  return trimmed === "" ? undefined : trimmed;
}
|
||||
|
||||
/**
 * Round a value down to an integer and accept it only when it is at least 1.
 *
 * @param value Candidate value of any type.
 * @param fallback Returned when `value` is not a finite number or floors below 1.
 */
function clampPositiveInt(value: unknown, fallback: number): number {
  // NaN never satisfies the comparison below, so it doubles as the "unusable" marker.
  const whole =
    typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : Number.NaN;
  return whole >= 1 ? whole : fallback;
}
|
||||
|
||||
/**
 * Round a value down to an integer and accept it only when it is zero or greater.
 *
 * @param value Candidate value of any type.
 * @param fallback Returned when `value` is not a finite number or floors below 0.
 */
function clampNonNegativeInt(value: unknown, fallback: number): number {
  // NaN never satisfies the comparison below, so it doubles as the "unusable" marker.
  const whole =
    typeof value === "number" && Number.isFinite(value) ? Math.floor(value) : Number.NaN;
  return whole >= 0 ? whole : fallback;
}
|
||||
|
||||
/**
 * Work out the effective failure-alert settings for a job.
 *
 * Returns `null` (no alerting) when the job sets `failureAlert: false`, or
 * when the job has no failure-alert object of its own and the global cron
 * config does not have `failureAlert.enabled === true`.
 *
 * Otherwise:
 * - `after` and `cooldownMs` come from the job, then the global config, and
 *   fall back to the module defaults when the chosen value is unusable.
 * - `channel` comes from the job's failure-alert setting, then the job's
 *   delivery channel, then `"last"`.
 * - `to` comes from the job's failure-alert setting, then the job's delivery
 *   target; it may be `undefined`.
 */
function resolveFailureAlert(
  state: CronServiceState,
  job: CronJob,
): { after: number; cooldownMs: number; channel: CronMessageChannel; to?: string } | null {
  const globalDefaults = state.deps.cronConfig?.failureAlert;
  const perJob = job.failureAlert;

  // Explicit per-job opt-out wins over everything else.
  if (perJob === false) {
    return null;
  }
  // Without a per-job setting, alerts only fire when enabled globally.
  const globallyEnabled = globalDefaults?.enabled === true;
  if (!perJob && !globallyEnabled) {
    return null;
  }

  const after = clampPositiveInt(
    perJob?.after ?? globalDefaults?.after,
    DEFAULT_FAILURE_ALERT_AFTER,
  );
  const cooldownMs = clampNonNegativeInt(
    perJob?.cooldownMs ?? globalDefaults?.cooldownMs,
    DEFAULT_FAILURE_ALERT_COOLDOWN_MS,
  );
  const channel =
    normalizeCronMessageChannel(perJob?.channel) ??
    normalizeCronMessageChannel(job.delivery?.channel) ??
    "last";
  const to = normalizeTo(perJob?.to) ?? normalizeTo(job.delivery?.to);

  return { after, cooldownMs, channel, to };
}
|
||||
|
||||
/**
 * Send a "job keeps failing" notification for a cron job.
 *
 * The message names the job (falling back to its id when the name is empty),
 * the consecutive-failure count, and the last error, trimmed and capped at
 * 200 UTF-16 code units. The cap never leaves half of a surrogate pair at the
 * end of the text.
 *
 * Delivery:
 * - When `deps.sendCronFailureAlert` is provided, the alert is handed to it
 *   fire-and-forget; a rejected promise is logged as a warning, not thrown.
 * - Otherwise the text is enqueued as a system event for the job's agent, and
 *   a heartbeat is requested right away when the job's wake mode is `"now"`.
 *
 * @param state Cron service state providing the `deps` callbacks used here.
 * @param params Job, last error text, failure count and resolved destination.
 */
function emitFailureAlert(
  state: CronServiceState,
  params: {
    job: CronJob;
    error?: string;
    consecutiveErrors: number;
    channel: CronMessageChannel;
    to?: string;
  },
) {
  const safeJobName = params.job.name || params.job.id;

  const fullError = params.error?.trim() || "unknown error";
  let truncatedError = fullError.slice(0, 200);
  if (fullError.length > 200) {
    // If the cut landed between the two halves of an astral character, drop
    // the dangling high surrogate so the alert text stays well-formed.
    const lastUnit = truncatedError.charCodeAt(199);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      truncatedError = truncatedError.slice(0, 199);
    }
  }

  const text = [
    `Cron job "${safeJobName}" failed ${params.consecutiveErrors} times`,
    `Last error: ${truncatedError}`,
  ].join("\n");

  if (state.deps.sendCronFailureAlert) {
    // Fire-and-forget: alert delivery must not block or fail result handling.
    void state.deps
      .sendCronFailureAlert({
        job: params.job,
        text,
        channel: params.channel,
        to: params.to,
      })
      .catch((err: unknown) => {
        state.deps.log.warn(
          { jobId: params.job.id, err: String(err) },
          "cron: failure alert delivery failed",
        );
      });
    return;
  }

  // No dedicated sender: surface the alert to the agent as a system event.
  state.deps.enqueueSystemEvent(text, { agentId: params.job.agentId });
  if (params.job.wakeMode === "now") {
    state.deps.requestHeartbeatNow({ reason: `cron:${params.job.id}:failure-alert` });
  }
}
|
||||
|
||||
/**
|
||||
* Apply the result of a job execution to the job's state.
|
||||
* Handles consecutive error tracking, exponential backoff, one-shot disable,
|
||||
@@ -181,8 +284,26 @@ export function applyJobResult(
|
||||
// Track consecutive errors for backoff / auto-disable.
|
||||
if (result.status === "error") {
|
||||
job.state.consecutiveErrors = (job.state.consecutiveErrors ?? 0) + 1;
|
||||
const alertConfig = resolveFailureAlert(state, job);
|
||||
if (alertConfig && job.state.consecutiveErrors >= alertConfig.after) {
|
||||
const now = state.deps.nowMs();
|
||||
const lastAlert = job.state.lastFailureAlertAtMs;
|
||||
const inCooldown =
|
||||
typeof lastAlert === "number" && now - lastAlert < Math.max(0, alertConfig.cooldownMs);
|
||||
if (!inCooldown) {
|
||||
emitFailureAlert(state, {
|
||||
job,
|
||||
error: result.error,
|
||||
consecutiveErrors: job.state.consecutiveErrors,
|
||||
channel: alertConfig.channel,
|
||||
to: alertConfig.to,
|
||||
});
|
||||
job.state.lastFailureAlertAtMs = now;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
job.state.consecutiveErrors = 0;
|
||||
job.state.lastFailureAlertAtMs = undefined;
|
||||
}
|
||||
|
||||
const shouldDelete =
|
||||
|
||||
Reference in New Issue
Block a user