mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-19 09:58:38 +00:00
fix(cron): add retry policy for one-shot jobs on transient errors (#24355) (openclaw#24435) thanks @hugenshen
Verified: - pnpm install --frozen-lockfile - pnpm check - pnpm test -- --run src/cron/service.issue-regressions.test.ts src/config/config-misc.test.ts Co-authored-by: hugenshen <16300669+hugenshen@users.noreply.github.com> Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -77,6 +77,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
- Cron/One-shot reliability: retry transient one-shot failures with bounded backoff and configurable retry policy before disabling. (#24435) Thanks @hugenshen.
|
||||||
- Gateway/Cron auditability: add gateway info logs for successful cron create, update, and remove operations. (#25090) Thanks .
|
- Gateway/Cron auditability: add gateway info logs for successful cron create, update, and remove operations. (#25090) Thanks .
|
||||||
- Cron/Schedule errors: notify users when a job is auto-disabled after repeated schedule computation failures. (#29098) Thanks .
|
- Cron/Schedule errors: notify users when a job is auto-disabled after repeated schedule computation failures. (#29098) Thanks .
|
||||||
- Cron/Schedule errors: notify users when a job is auto-disabled after repeated schedule computation failures. (#29098) Thanks .
|
- Cron/Schedule errors: notify users when a job is auto-disabled after repeated schedule computation failures. (#29098) Thanks .
|
||||||
|
|||||||
@@ -353,6 +353,38 @@ Notes:
|
|||||||
- Isolated cron run sessions in `sessions.json` are pruned by `cron.sessionRetention` (default `24h`; set `false` to disable).
|
- Isolated cron run sessions in `sessions.json` are pruned by `cron.sessionRetention` (default `24h`; set `false` to disable).
|
||||||
- Override store path: `cron.store` in config.
|
- Override store path: `cron.store` in config.
|
||||||
|
|
||||||
|
## Retry policy
|
||||||
|
|
||||||
|
When a job fails, OpenClaw classifies the error as **transient** (retryable) or **permanent** (not retried; a one-shot job is disabled immediately).
|
||||||
|
|
||||||
|
### Transient errors (retried)
|
||||||
|
|
||||||
|
- Rate limit (429, too many requests, resource exhausted)
|
||||||
|
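- Network errors (ECONNRESET, ECONNREFUSED, fetch failed, socket) and timeouts (timeout, ETIMEDOUT); timeouts are the separate `timeout` type in `cron.retry.retryOn`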
|
||||||
|
- Server errors (5xx)
|
||||||
|
- Cloudflare-related errors (classified under `rate_limit`)
|
||||||
|
|
||||||
|
### Permanent errors (no retry)
|
||||||
|
|
||||||
|
- Auth failures (invalid API key, unauthorized)
|
||||||
|
- Config or validation errors
|
||||||
|
- Other non-transient errors
|
||||||
|
|
||||||
|
### Default behavior (no config)
|
||||||
|
|
||||||
|
**One-shot jobs (`schedule.kind: "at"`):**
|
||||||
|
|
||||||
|
- On transient error: retry up to 3 times with exponential backoff (30s → 1m → 5m).
|
||||||
|
- On permanent error: disable immediately.
|
||||||
|
- On success or skip: disable (or delete if `deleteAfterRun: true`).
|
||||||
|
|
||||||
|
**Recurring jobs (`cron` / `every`):**
|
||||||
|
|
||||||
|
- On any error: apply exponential backoff (30s → 1m → 5m → 15m → 60m) before the next scheduled run.
|
||||||
|
- Job stays enabled; backoff resets after the next successful run.
|
||||||
|
|
||||||
|
Configure `cron.retry` to override the one-shot defaults (see [Configuration](/automation/cron-jobs#configuration)).
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
```json5
|
```json5
|
||||||
@@ -361,6 +393,12 @@ Notes:
|
|||||||
enabled: true, // default true
|
enabled: true, // default true
|
||||||
store: "~/.openclaw/cron/jobs.json",
|
store: "~/.openclaw/cron/jobs.json",
|
||||||
maxConcurrentRuns: 1, // default 1
|
maxConcurrentRuns: 1, // default 1
|
||||||
|
// Optional: override retry policy for one-shot jobs
|
||||||
|
retry: {
|
||||||
|
maxAttempts: 3,
|
||||||
|
backoffMs: [60000, 120000, 300000],
|
||||||
|
retryOn: ["rate_limit", "network", "server_error"],
|
||||||
|
},
|
||||||
webhook: "https://example.invalid/legacy", // deprecated fallback for stored notify:true jobs
|
webhook: "https://example.invalid/legacy", // deprecated fallback for stored notify:true jobs
|
||||||
webhookToken: "replace-with-dedicated-webhook-token", // optional bearer token for webhook mode
|
webhookToken: "replace-with-dedicated-webhook-token", // optional bearer token for webhook mode
|
||||||
sessionRetention: "24h", // duration string or false
|
sessionRetention: "24h", // duration string or false
|
||||||
@@ -617,7 +655,7 @@ openclaw system event --mode now --text "Next heartbeat: check battery."
|
|||||||
- OpenClaw applies exponential retry backoff for recurring jobs after consecutive errors:
|
- OpenClaw applies exponential retry backoff for recurring jobs after consecutive errors:
|
||||||
30s, 1m, 5m, 15m, then 60m between retries.
|
30s, 1m, 5m, 15m, then 60m between retries.
|
||||||
- Backoff resets automatically after the next successful run.
|
- Backoff resets automatically after the next successful run.
|
||||||
- One-shot (`at`) jobs disable after a terminal run (`ok`, `error`, or `skipped`) and do not retry.
|
- One-shot (`at`) jobs retry transient errors (rate limit, network, timeout, server error) up to 3 times with backoff; permanent errors disable the job immediately. See [Retry policy](/automation/cron-jobs#retry-policy).
|
||||||
|
|
||||||
### Telegram delivers to the wrong place
|
### Telegram delivers to the wrong place
|
||||||
|
|
||||||
|
|||||||
@@ -193,6 +193,19 @@ describe("cron webhook schema", () => {
|
|||||||
|
|
||||||
expect(res.success).toBe(false);
|
expect(res.success).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("accepts cron.retry config", () => {
|
||||||
|
const res = OpenClawSchema.safeParse({
|
||||||
|
cron: {
|
||||||
|
retry: {
|
||||||
|
maxAttempts: 5,
|
||||||
|
backoffMs: [60000, 120000, 300000],
|
||||||
|
retryOn: ["rate_limit", "network"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
expect(res.success).toBe(true);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("broadcast", () => {
|
describe("broadcast", () => {
|
||||||
|
|||||||
@@ -108,6 +108,10 @@ const TARGET_KEYS = [
|
|||||||
"cron.enabled",
|
"cron.enabled",
|
||||||
"cron.store",
|
"cron.store",
|
||||||
"cron.maxConcurrentRuns",
|
"cron.maxConcurrentRuns",
|
||||||
|
"cron.retry",
|
||||||
|
"cron.retry.maxAttempts",
|
||||||
|
"cron.retry.backoffMs",
|
||||||
|
"cron.retry.retryOn",
|
||||||
"cron.webhook",
|
"cron.webhook",
|
||||||
"cron.webhookToken",
|
"cron.webhookToken",
|
||||||
"cron.sessionRetention",
|
"cron.sessionRetention",
|
||||||
|
|||||||
@@ -1064,6 +1064,14 @@ export const FIELD_HELP: Record<string, string> = {
|
|||||||
"Path to the cron job store file used to persist scheduled jobs across restarts. Set an explicit path only when you need custom storage layout, backups, or mounted volumes.",
|
"Path to the cron job store file used to persist scheduled jobs across restarts. Set an explicit path only when you need custom storage layout, backups, or mounted volumes.",
|
||||||
"cron.maxConcurrentRuns":
|
"cron.maxConcurrentRuns":
|
||||||
"Limits how many cron jobs can execute at the same time when multiple schedules fire together. Use lower values to protect CPU/memory under heavy automation load, or raise carefully for higher throughput.",
|
"Limits how many cron jobs can execute at the same time when multiple schedules fire together. Use lower values to protect CPU/memory under heavy automation load, or raise carefully for higher throughput.",
|
||||||
|
"cron.retry":
|
||||||
|
"Overrides the default retry policy for one-shot jobs when they fail with transient errors (rate limit, network, server_error). Omit to use defaults: maxAttempts 3, backoffMs [30000, 60000, 300000], retry all transient types.",
|
||||||
|
"cron.retry.maxAttempts":
|
||||||
|
"Max retries for one-shot jobs on transient errors before permanent disable (default: 3).",
|
||||||
|
"cron.retry.backoffMs":
|
||||||
|
"Backoff delays in ms for each retry attempt (default: [30000, 60000, 300000]). Use shorter values for faster retries.",
|
||||||
|
"cron.retry.retryOn":
|
||||||
|
"Error types to retry: rate_limit, network, timeout, server_error. Use to restrict which errors trigger retries; omit to retry all transient types.",
|
||||||
"cron.webhook":
|
"cron.webhook":
|
||||||
'Deprecated legacy fallback webhook URL used only for old jobs with `notify=true`. Migrate to per-job delivery using `delivery.mode="webhook"` plus `delivery.to`, and avoid relying on this global field.',
|
'Deprecated legacy fallback webhook URL used only for old jobs with `notify=true`. Migrate to per-job delivery using `delivery.mode="webhook"` plus `delivery.to`, and avoid relying on this global field.',
|
||||||
"cron.webhookToken":
|
"cron.webhookToken":
|
||||||
|
|||||||
@@ -504,6 +504,10 @@ export const FIELD_LABELS: Record<string, string> = {
|
|||||||
"cron.enabled": "Cron Enabled",
|
"cron.enabled": "Cron Enabled",
|
||||||
"cron.store": "Cron Store Path",
|
"cron.store": "Cron Store Path",
|
||||||
"cron.maxConcurrentRuns": "Cron Max Concurrent Runs",
|
"cron.maxConcurrentRuns": "Cron Max Concurrent Runs",
|
||||||
|
"cron.retry": "Cron Retry Policy",
|
||||||
|
"cron.retry.maxAttempts": "Cron Retry Max Attempts",
|
||||||
|
"cron.retry.backoffMs": "Cron Retry Backoff (ms)",
|
||||||
|
"cron.retry.retryOn": "Cron Retry Error Types",
|
||||||
"cron.webhook": "Cron Legacy Webhook (Deprecated)",
|
"cron.webhook": "Cron Legacy Webhook (Deprecated)",
|
||||||
"cron.webhookToken": "Cron Webhook Bearer Token",
|
"cron.webhookToken": "Cron Webhook Bearer Token",
|
||||||
"cron.sessionRetention": "Cron Session Retention",
|
"cron.sessionRetention": "Cron Session Retention",
|
||||||
|
|||||||
@@ -1,7 +1,21 @@
|
|||||||
|
/** Error types that can trigger retries for one-shot jobs. */
|
||||||
|
export type CronRetryOn = "rate_limit" | "network" | "timeout" | "server_error";
|
||||||
|
|
||||||
|
export type CronRetryConfig = {
|
||||||
|
/** Max retries for transient errors before permanent disable (default: 3). */
|
||||||
|
maxAttempts?: number;
|
||||||
|
/** Backoff delays in ms for each retry attempt (default: [30000, 60000, 300000]). */
|
||||||
|
backoffMs?: number[];
|
||||||
|
/** Error types to retry; omit to retry all transient types. */
|
||||||
|
retryOn?: CronRetryOn[];
|
||||||
|
};
|
||||||
|
|
||||||
export type CronConfig = {
|
export type CronConfig = {
|
||||||
enabled?: boolean;
|
enabled?: boolean;
|
||||||
store?: string;
|
store?: string;
|
||||||
maxConcurrentRuns?: number;
|
maxConcurrentRuns?: number;
|
||||||
|
/** Override default retry policy for one-shot jobs on transient errors. */
|
||||||
|
retry?: CronRetryConfig;
|
||||||
/**
|
/**
|
||||||
* Deprecated legacy fallback webhook URL used only for stored jobs with notify=true.
|
* Deprecated legacy fallback webhook URL used only for stored jobs with notify=true.
|
||||||
* Prefer per-job delivery.mode="webhook" with delivery.to.
|
* Prefer per-job delivery.mode="webhook" with delivery.to.
|
||||||
|
|||||||
@@ -374,6 +374,17 @@ export const OpenClawSchema = z
|
|||||||
enabled: z.boolean().optional(),
|
enabled: z.boolean().optional(),
|
||||||
store: z.string().optional(),
|
store: z.string().optional(),
|
||||||
maxConcurrentRuns: z.number().int().positive().optional(),
|
maxConcurrentRuns: z.number().int().positive().optional(),
|
||||||
|
retry: z
|
||||||
|
.object({
|
||||||
|
maxAttempts: z.number().int().min(0).max(10).optional(),
|
||||||
|
backoffMs: z.array(z.number().int().nonnegative()).min(1).max(10).optional(),
|
||||||
|
retryOn: z
|
||||||
|
.array(z.enum(["rate_limit", "network", "timeout", "server_error"]))
|
||||||
|
.min(1)
|
||||||
|
.optional(),
|
||||||
|
})
|
||||||
|
.strict()
|
||||||
|
.optional(),
|
||||||
webhook: HttpUrlSchema.optional(),
|
webhook: HttpUrlSchema.optional(),
|
||||||
webhookToken: z.string().optional().register(sensitive),
|
webhookToken: z.string().optional().register(sensitive),
|
||||||
sessionRetention: z.union([z.string(), z.literal(false)]).optional(),
|
sessionRetention: z.union([z.string(), z.literal(false)]).optional(),
|
||||||
|
|||||||
@@ -752,6 +752,224 @@ describe("Cron issue regressions", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("#24355: one-shot job retries on transient error, then succeeds", async () => {
|
||||||
|
const store = await makeStorePath();
|
||||||
|
const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z");
|
||||||
|
|
||||||
|
const cronJob = createIsolatedRegressionJob({
|
||||||
|
id: "oneshot-retry",
|
||||||
|
name: "reminder",
|
||||||
|
scheduledAt,
|
||||||
|
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
|
||||||
|
payload: { kind: "agentTurn", message: "remind me" },
|
||||||
|
state: { nextRunAtMs: scheduledAt },
|
||||||
|
});
|
||||||
|
cronJob.deleteAfterRun = false;
|
||||||
|
await writeCronJobs(store.storePath, [cronJob]);
|
||||||
|
|
||||||
|
let now = scheduledAt;
|
||||||
|
const runIsolatedAgentJob = vi
|
||||||
|
.fn()
|
||||||
|
.mockResolvedValueOnce({ status: "error", error: "429 rate limit exceeded" })
|
||||||
|
.mockResolvedValueOnce({ status: "ok", summary: "done" });
|
||||||
|
const state = createCronServiceState({
|
||||||
|
cronEnabled: true,
|
||||||
|
storePath: store.storePath,
|
||||||
|
log: noopLogger,
|
||||||
|
nowMs: () => now,
|
||||||
|
enqueueSystemEvent: vi.fn(),
|
||||||
|
requestHeartbeatNow: vi.fn(),
|
||||||
|
runIsolatedAgentJob,
|
||||||
|
});
|
||||||
|
|
||||||
|
await onTimer(state);
|
||||||
|
let job = state.store?.jobs.find((j) => j.id === "oneshot-retry");
|
||||||
|
expect(job).toBeDefined();
|
||||||
|
expect(job!.enabled).toBe(true);
|
||||||
|
expect(job!.state.lastStatus).toBe("error");
|
||||||
|
expect(job!.state.nextRunAtMs).toBeDefined();
|
||||||
|
expect(job!.state.nextRunAtMs).toBeGreaterThan(scheduledAt);
|
||||||
|
|
||||||
|
now = (job!.state.nextRunAtMs ?? 0) + 1;
|
||||||
|
await onTimer(state);
|
||||||
|
job = state.store?.jobs.find((j) => j.id === "oneshot-retry");
|
||||||
|
expect(job).toBeDefined();
|
||||||
|
expect(job!.state.lastStatus).toBe("ok");
|
||||||
|
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(2);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("#24355: one-shot job disabled after max transient retries", async () => {
|
||||||
|
const store = await makeStorePath();
|
||||||
|
const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z");
|
||||||
|
|
||||||
|
const cronJob = createIsolatedRegressionJob({
|
||||||
|
id: "oneshot-max-retries",
|
||||||
|
name: "reminder",
|
||||||
|
scheduledAt,
|
||||||
|
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
|
||||||
|
payload: { kind: "agentTurn", message: "remind me" },
|
||||||
|
state: { nextRunAtMs: scheduledAt },
|
||||||
|
});
|
||||||
|
await writeCronJobs(store.storePath, [cronJob]);
|
||||||
|
|
||||||
|
let now = scheduledAt;
|
||||||
|
const runIsolatedAgentJob = vi.fn().mockResolvedValue({
|
||||||
|
status: "error",
|
||||||
|
error: "429 rate limit exceeded",
|
||||||
|
});
|
||||||
|
const state = createCronServiceState({
|
||||||
|
cronEnabled: true,
|
||||||
|
storePath: store.storePath,
|
||||||
|
log: noopLogger,
|
||||||
|
nowMs: () => now,
|
||||||
|
enqueueSystemEvent: vi.fn(),
|
||||||
|
requestHeartbeatNow: vi.fn(),
|
||||||
|
runIsolatedAgentJob,
|
||||||
|
});
|
||||||
|
|
||||||
|
for (let i = 0; i < 4; i++) {
|
||||||
|
await onTimer(state);
|
||||||
|
const job = state.store?.jobs.find((j) => j.id === "oneshot-max-retries");
|
||||||
|
expect(job).toBeDefined();
|
||||||
|
if (i < 3) {
|
||||||
|
expect(job!.enabled).toBe(true);
|
||||||
|
now = (job!.state.nextRunAtMs ?? now) + 1;
|
||||||
|
} else {
|
||||||
|
expect(job!.enabled).toBe(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(4);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("#24355: one-shot job respects cron.retry config", async () => {
|
||||||
|
const store = await makeStorePath();
|
||||||
|
const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z");
|
||||||
|
|
||||||
|
const cronJob = createIsolatedRegressionJob({
|
||||||
|
id: "oneshot-custom-retry",
|
||||||
|
name: "reminder",
|
||||||
|
scheduledAt,
|
||||||
|
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
|
||||||
|
payload: { kind: "agentTurn", message: "remind me" },
|
||||||
|
state: { nextRunAtMs: scheduledAt },
|
||||||
|
});
|
||||||
|
await writeCronJobs(store.storePath, [cronJob]);
|
||||||
|
|
||||||
|
let now = scheduledAt;
|
||||||
|
const runIsolatedAgentJob = vi.fn().mockResolvedValue({
|
||||||
|
status: "error",
|
||||||
|
error: "429 rate limit exceeded",
|
||||||
|
});
|
||||||
|
const state = createCronServiceState({
|
||||||
|
cronEnabled: true,
|
||||||
|
storePath: store.storePath,
|
||||||
|
log: noopLogger,
|
||||||
|
nowMs: () => now,
|
||||||
|
enqueueSystemEvent: vi.fn(),
|
||||||
|
requestHeartbeatNow: vi.fn(),
|
||||||
|
runIsolatedAgentJob,
|
||||||
|
cronConfig: {
|
||||||
|
retry: { maxAttempts: 2, backoffMs: [1000, 2000] },
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
for (let i = 0; i < 4; i++) {
|
||||||
|
await onTimer(state);
|
||||||
|
const job = state.store?.jobs.find((j) => j.id === "oneshot-custom-retry");
|
||||||
|
expect(job).toBeDefined();
|
||||||
|
if (i < 2) {
|
||||||
|
expect(job!.enabled).toBe(true);
|
||||||
|
now = (job!.state.nextRunAtMs ?? now) + 1;
|
||||||
|
} else {
|
||||||
|
expect(job!.enabled).toBe(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(3);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("#24355: one-shot job disabled immediately on permanent error", async () => {
|
||||||
|
const store = await makeStorePath();
|
||||||
|
const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z");
|
||||||
|
|
||||||
|
const cronJob = createIsolatedRegressionJob({
|
||||||
|
id: "oneshot-permanent-error",
|
||||||
|
name: "reminder",
|
||||||
|
scheduledAt,
|
||||||
|
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
|
||||||
|
payload: { kind: "agentTurn", message: "remind me" },
|
||||||
|
state: { nextRunAtMs: scheduledAt },
|
||||||
|
});
|
||||||
|
await writeCronJobs(store.storePath, [cronJob]);
|
||||||
|
|
||||||
|
let now = scheduledAt;
|
||||||
|
const state = createCronServiceState({
|
||||||
|
cronEnabled: true,
|
||||||
|
storePath: store.storePath,
|
||||||
|
log: noopLogger,
|
||||||
|
nowMs: () => now,
|
||||||
|
enqueueSystemEvent: vi.fn(),
|
||||||
|
requestHeartbeatNow: vi.fn(),
|
||||||
|
runIsolatedAgentJob: vi.fn().mockResolvedValue({
|
||||||
|
status: "error",
|
||||||
|
error: "invalid API key",
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
await onTimer(state);
|
||||||
|
|
||||||
|
const job = state.store?.jobs.find((j) => j.id === "oneshot-permanent-error");
|
||||||
|
expect(job).toBeDefined();
|
||||||
|
expect(job!.enabled).toBe(false);
|
||||||
|
expect(job!.state.lastStatus).toBe("error");
|
||||||
|
expect(job!.state.nextRunAtMs).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("#24355: deleteAfterRun:true one-shot job is deleted after successful retry", async () => {
|
||||||
|
const store = await makeStorePath();
|
||||||
|
const scheduledAt = Date.parse("2026-02-06T10:00:00.000Z");
|
||||||
|
|
||||||
|
const cronJob = createIsolatedRegressionJob({
|
||||||
|
id: "oneshot-deleteAfterRun-retry",
|
||||||
|
name: "reminder",
|
||||||
|
scheduledAt,
|
||||||
|
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
|
||||||
|
payload: { kind: "agentTurn", message: "remind me" },
|
||||||
|
state: { nextRunAtMs: scheduledAt },
|
||||||
|
});
|
||||||
|
cronJob.deleteAfterRun = true;
|
||||||
|
await writeCronJobs(store.storePath, [cronJob]);
|
||||||
|
|
||||||
|
let now = scheduledAt;
|
||||||
|
const runIsolatedAgentJob = vi
|
||||||
|
.fn()
|
||||||
|
.mockResolvedValueOnce({ status: "error", error: "429 rate limit exceeded" })
|
||||||
|
.mockResolvedValueOnce({ status: "ok", summary: "done" });
|
||||||
|
const state = createCronServiceState({
|
||||||
|
cronEnabled: true,
|
||||||
|
storePath: store.storePath,
|
||||||
|
log: noopLogger,
|
||||||
|
nowMs: () => now,
|
||||||
|
enqueueSystemEvent: vi.fn(),
|
||||||
|
requestHeartbeatNow: vi.fn(),
|
||||||
|
runIsolatedAgentJob,
|
||||||
|
});
|
||||||
|
|
||||||
|
// First run: transient error → retry scheduled, job still in store.
|
||||||
|
await onTimer(state);
|
||||||
|
let job = state.store?.jobs.find((j) => j.id === "oneshot-deleteAfterRun-retry");
|
||||||
|
expect(job).toBeDefined();
|
||||||
|
expect(job!.enabled).toBe(true);
|
||||||
|
expect(job!.state.lastStatus).toBe("error");
|
||||||
|
expect(job!.state.nextRunAtMs).toBeGreaterThan(scheduledAt);
|
||||||
|
|
||||||
|
// Second run: success → deleteAfterRun removes the job from the store.
|
||||||
|
now = (job!.state.nextRunAtMs ?? 0) + 1;
|
||||||
|
await onTimer(state);
|
||||||
|
const deleted = state.store?.jobs.find((j) => j.id === "oneshot-deleteAfterRun-retry");
|
||||||
|
expect(deleted).toBeUndefined();
|
||||||
|
expect(runIsolatedAgentJob).toHaveBeenCalledTimes(2);
|
||||||
|
});
|
||||||
|
|
||||||
it("prevents spin loop when cron job completes within the scheduled second (#17821)", async () => {
|
it("prevents spin loop when cron job completes within the scheduled second (#17821)", async () => {
|
||||||
const store = await makeStorePath();
|
const store = await makeStorePath();
|
||||||
// Simulate a cron job "0 13 * * *" (daily 13:00 UTC) that fires exactly
|
// Simulate a cron job "0 13 * * *" (daily 13:00 UTC) that fires exactly
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import type { CronConfig, CronRetryOn } from "../../config/types.cron.js";
|
||||||
import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
|
import type { HeartbeatRunResult } from "../../infra/heartbeat-wake.js";
|
||||||
import { DEFAULT_AGENT_ID } from "../../routing/session-key.js";
|
import { DEFAULT_AGENT_ID } from "../../routing/session-key.js";
|
||||||
import { resolveCronDeliveryPlan } from "../delivery.js";
|
import { resolveCronDeliveryPlan } from "../delivery.js";
|
||||||
@@ -91,7 +92,7 @@ function isAbortError(err: unknown): boolean {
|
|||||||
* Exponential backoff delays (in ms) indexed by consecutive error count.
|
* Exponential backoff delays (in ms) indexed by consecutive error count.
|
||||||
* After the last entry the delay stays constant.
|
* After the last entry the delay stays constant.
|
||||||
*/
|
*/
|
||||||
const ERROR_BACKOFF_SCHEDULE_MS = [
|
const DEFAULT_BACKOFF_SCHEDULE_MS = [
|
||||||
30_000, // 1st error → 30 s
|
30_000, // 1st error → 30 s
|
||||||
60_000, // 2nd error → 1 min
|
60_000, // 2nd error → 1 min
|
||||||
5 * 60_000, // 3rd error → 5 min
|
5 * 60_000, // 3rd error → 5 min
|
||||||
@@ -99,9 +100,43 @@ const ERROR_BACKOFF_SCHEDULE_MS = [
|
|||||||
60 * 60_000, // 5th+ error → 60 min
|
60 * 60_000, // 5th+ error → 60 min
|
||||||
];
|
];
|
||||||
|
|
||||||
function errorBackoffMs(consecutiveErrors: number): number {
|
function errorBackoffMs(
|
||||||
const idx = Math.min(consecutiveErrors - 1, ERROR_BACKOFF_SCHEDULE_MS.length - 1);
|
consecutiveErrors: number,
|
||||||
return ERROR_BACKOFF_SCHEDULE_MS[Math.max(0, idx)];
|
scheduleMs = DEFAULT_BACKOFF_SCHEDULE_MS,
|
||||||
|
): number {
|
||||||
|
const idx = Math.min(consecutiveErrors - 1, scheduleMs.length - 1);
|
||||||
|
return scheduleMs[Math.max(0, idx)];
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Default max retries for one-shot jobs on transient errors (#24355). */
|
||||||
|
const DEFAULT_MAX_TRANSIENT_RETRIES = 3;
|
||||||
|
|
||||||
|
const TRANSIENT_PATTERNS: Record<string, RegExp> = {
|
||||||
|
rate_limit: /(rate[_ ]limit|too many requests|429|resource has been exhausted|cloudflare)/i,
|
||||||
|
network: /(network|econnreset|econnrefused|fetch failed|socket)/i,
|
||||||
|
timeout: /(timeout|etimedout)/i,
|
||||||
|
server_error: /\b5\d{2}\b/,
|
||||||
|
};
|
||||||
|
|
||||||
|
function isTransientCronError(error: string | undefined, retryOn?: CronRetryOn[]): boolean {
|
||||||
|
if (!error || typeof error !== "string") {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const keys = retryOn?.length ? retryOn : (Object.keys(TRANSIENT_PATTERNS) as CronRetryOn[]);
|
||||||
|
return keys.some((k) => TRANSIENT_PATTERNS[k]?.test(error));
|
||||||
|
}
|
||||||
|
|
||||||
|
function resolveRetryConfig(cronConfig?: CronConfig) {
|
||||||
|
const retry = cronConfig?.retry;
|
||||||
|
return {
|
||||||
|
maxAttempts:
|
||||||
|
typeof retry?.maxAttempts === "number" ? retry.maxAttempts : DEFAULT_MAX_TRANSIENT_RETRIES,
|
||||||
|
backoffMs:
|
||||||
|
Array.isArray(retry?.backoffMs) && retry.backoffMs.length > 0
|
||||||
|
? retry.backoffMs
|
||||||
|
: DEFAULT_BACKOFF_SCHEDULE_MS.slice(0, 3),
|
||||||
|
retryOn: Array.isArray(retry?.retryOn) && retry.retryOn.length > 0 ? retry.retryOn : undefined,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function resolveDeliveryStatus(params: { job: CronJob; delivered?: boolean }): CronDeliveryStatus {
|
function resolveDeliveryStatus(params: { job: CronJob; delivered?: boolean }): CronDeliveryStatus {
|
||||||
@@ -155,21 +190,47 @@ export function applyJobResult(
|
|||||||
|
|
||||||
if (!shouldDelete) {
|
if (!shouldDelete) {
|
||||||
if (job.schedule.kind === "at") {
|
if (job.schedule.kind === "at") {
|
||||||
// One-shot jobs are always disabled after ANY terminal status
|
if (result.status === "ok" || result.status === "skipped") {
|
||||||
// (ok, error, or skipped). This prevents tight-loop rescheduling
|
// One-shot done or skipped: disable to prevent tight-loop (#11452).
|
||||||
// when computeJobNextRunAtMs returns the past atMs value (#11452).
|
job.enabled = false;
|
||||||
job.enabled = false;
|
job.state.nextRunAtMs = undefined;
|
||||||
job.state.nextRunAtMs = undefined;
|
} else if (result.status === "error") {
|
||||||
if (result.status === "error") {
|
const retryConfig = resolveRetryConfig(state.deps.cronConfig);
|
||||||
state.deps.log.warn(
|
const transient = isTransientCronError(result.error, retryConfig.retryOn);
|
||||||
{
|
// consecutiveErrors is always set to ≥1 by the increment block above.
|
||||||
jobId: job.id,
|
const consecutive = job.state.consecutiveErrors;
|
||||||
jobName: job.name,
|
if (transient && consecutive <= retryConfig.maxAttempts) {
|
||||||
consecutiveErrors: job.state.consecutiveErrors,
|
// Schedule retry with backoff (#24355).
|
||||||
error: result.error,
|
const backoff = errorBackoffMs(consecutive, retryConfig.backoffMs);
|
||||||
},
|
job.state.nextRunAtMs = result.endedAt + backoff;
|
||||||
"cron: disabling one-shot job after error",
|
state.deps.log.info(
|
||||||
);
|
{
|
||||||
|
jobId: job.id,
|
||||||
|
jobName: job.name,
|
||||||
|
consecutiveErrors: consecutive,
|
||||||
|
backoffMs: backoff,
|
||||||
|
nextRunAtMs: job.state.nextRunAtMs,
|
||||||
|
},
|
||||||
|
"cron: scheduling one-shot retry after transient error",
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
// Permanent error or max retries exhausted: disable.
|
||||||
|
// Note: deleteAfterRun:true only triggers on ok (see shouldDelete above),
|
||||||
|
// so exhausted-retry jobs are disabled but intentionally kept in the store
|
||||||
|
// to preserve the error state for inspection.
|
||||||
|
job.enabled = false;
|
||||||
|
job.state.nextRunAtMs = undefined;
|
||||||
|
state.deps.log.warn(
|
||||||
|
{
|
||||||
|
jobId: job.id,
|
||||||
|
jobName: job.name,
|
||||||
|
consecutiveErrors: consecutive,
|
||||||
|
error: result.error,
|
||||||
|
reason: transient ? "max retries exhausted" : "permanent error",
|
||||||
|
},
|
||||||
|
"cron: disabling one-shot job after error",
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if (result.status === "error" && job.enabled) {
|
} else if (result.status === "error" && job.enabled) {
|
||||||
// Apply exponential backoff for errored jobs to prevent retry storms.
|
// Apply exponential backoff for errored jobs to prevent retry storms.
|
||||||
@@ -474,9 +535,20 @@ function isRunnableJob(params: {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (params.skipAtIfAlreadyRan && job.schedule.kind === "at" && job.state.lastStatus) {
|
if (params.skipAtIfAlreadyRan && job.schedule.kind === "at" && job.state.lastStatus) {
|
||||||
// Any terminal status (ok, error, skipped) means the job already ran at least once.
|
// One-shot with terminal status: skip unless it's a transient-error retry.
|
||||||
// Don't re-fire it on restart — applyJobResult disables one-shot jobs, but guard
|
// Retries have nextRunAtMs > lastRunAtMs (scheduled after the failed run) (#24355).
|
||||||
// here defensively (#13845).
|
// ok/skipped or error-without-retry always skip (#13845).
|
||||||
|
const lastRun = job.state.lastRunAtMs;
|
||||||
|
const nextRun = job.state.nextRunAtMs;
|
||||||
|
if (
|
||||||
|
job.state.lastStatus === "error" &&
|
||||||
|
job.enabled &&
|
||||||
|
typeof nextRun === "number" &&
|
||||||
|
typeof lastRun === "number" &&
|
||||||
|
nextRun > lastRun
|
||||||
|
) {
|
||||||
|
return nowMs >= nextRun;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const next = job.state.nextRunAtMs;
|
const next = job.state.nextRunAtMs;
|
||||||
|
|||||||
Reference in New Issue
Block a user