From 4030de6c7307996a88c18158f76a618721128f0c Mon Sep 17 00:00:00 2001 From: scoootscooob <167050519+scoootscooob@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:38:59 -0800 Subject: [PATCH] fix(cron): move session reaper to finally block so it runs reliably (#31996) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(cron): move session reaper to finally block so it runs reliably The cron session reaper was placed inside the try block of onTimer(), after job execution and state updates. If the locked persist section threw, the reaper was skipped — causing isolated cron run sessions to accumulate indefinitely in sessions.json. Move the reaper into the finally block so it always executes after a timer tick, regardless of whether job execution succeeded. The reaper is already self-throttled (MIN_SWEEP_INTERVAL_MS = 5 min) so calling it more reliably has no performance impact. Closes #31946 Co-Authored-By: Claude Opus 4.6 * fix: strengthen cron reaper failure-path coverage and changelog (#31996) (thanks @scoootscooob) --------- Co-authored-by: Claude Opus 4.6 Co-authored-by: Peter Steinberger --- CHANGELOG.md | 1 + .../service.session-reaper-in-finally.test.ts | 165 ++++++++++++++++++ src/cron/service/timer.ts | 6 +- 3 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 src/cron/service.session-reaper-in-finally.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 33fe7c58fb9..154f00d2e95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ Docs: https://docs.openclaw.ai - Browser/default profile selection: default `browser.defaultProfile` behavior now prefers `openclaw` (managed standalone CDP) when no explicit default is configured, while still auto-provisioning the `chrome` relay profile for explicit opt-in use. (#32031) Fixes #31907. Thanks @liuxiaopai-ai. - Doctor/local memory provider checks: stop false-positive local-provider warnings when `provider=local` and no explicit `modelPath` is set by honoring default local model fallback while still warning when gateway probe reports local embeddings not ready. (#32014) Fixes #31998. Thanks @adhishthite. - Feishu/Run channel fallback: prefer `Provider` over `Surface` when inferring queued run `messageProvider` fallback (when `OriginatingChannel` is missing), preventing Feishu turns from being mislabeled as `webchat` in mixed relay metadata contexts. (#31880) Fixes #31859. Thanks @liuxiaopai-ai. +- Cron/session reaper reliability: move cron session reaper sweeps into `onTimer` `finally` and keep pruning active even when timer ticks fail early (for example cron store parse failures), preventing stale isolated run sessions from accumulating indefinitely. (#31996) Fixes #31946. Thanks @scoootscooob. - Sandbox/Docker setup command parsing: accept `agents.*.sandbox.docker.setupCommand` as either a string or a string array, and normalize arrays to newline-delimited shell scripts so multi-step setup commands no longer concatenate without separators. (#31953) Thanks @liuxiaopai-ai. - Gateway/Plugin HTTP route precedence: run explicit plugin HTTP routes before the Control UI SPA catch-all so registered plugin webhook/custom paths remain reachable, while unmatched paths still fall through to Control UI handling. (#31885) Thanks @Sid-Qin. - macOS/LaunchAgent security defaults: write `Umask=63` (octal `077`) into generated gateway launchd plists so post-update service reinstalls keep owner-only file permissions by default instead of falling back to system `022`. (#32022) Fixes #31905. Thanks @liuxiaopai-ai. diff --git a/src/cron/service.session-reaper-in-finally.test.ts b/src/cron/service.session-reaper-in-finally.test.ts new file mode 100644 index 00000000000..f590b330d44 --- /dev/null +++ b/src/cron/service.session-reaper-in-finally.test.ts @@ -0,0 +1,165 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { createNoopLogger, createCronStoreHarness } from "./service.test-harness.js"; +import { createCronServiceState } from "./service/state.js"; +import { onTimer } from "./service/timer.js"; +import { resetReaperThrottle } from "./session-reaper.js"; +import type { CronJob } from "./types.js"; + +const noopLogger = createNoopLogger(); +const { makeStorePath } = createCronStoreHarness({ + prefix: "openclaw-cron-reaper-finally-", +}); + +function createDueIsolatedJob(params: { id: string; nowMs: number }): CronJob { + return { + id: params.id, + name: params.id, + enabled: true, + deleteAfterRun: false, + createdAtMs: params.nowMs, + updatedAtMs: params.nowMs, + schedule: { kind: "every", everyMs: 60_000 }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "test" }, + delivery: { mode: "none" }, + state: { nextRunAtMs: params.nowMs }, + }; +} + +describe("CronService - session reaper runs in finally block (#31946)", () => { + beforeEach(() => { + noopLogger.debug.mockClear(); + noopLogger.info.mockClear(); + noopLogger.warn.mockClear(); + noopLogger.error.mockClear(); + resetReaperThrottle(); + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + it("session reaper runs even when job execution throws", async () => { + const store = await makeStorePath(); + const now = Date.parse("2026-02-10T10:00:00.000Z"); + + // Write a store with a due job that will trigger execution. + await fs.mkdir(path.dirname(store.storePath), { recursive: true }); + await fs.writeFile( + store.storePath, + JSON.stringify({ + version: 1, + jobs: [createDueIsolatedJob({ id: "failing-job", nowMs: now })], + }), + "utf-8", + ); + + // Create a mock sessionStorePath to track if the reaper is called. + const sessionStorePath = path.join(path.dirname(store.storePath), "sessions", "sessions.json"); + + const state = createCronServiceState({ + storePath: store.storePath, + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + // This will throw, simulating a failure during job execution. + runIsolatedAgentJob: vi.fn().mockRejectedValue(new Error("gateway down")), + sessionStorePath, + }); + + await onTimer(state); + + // After onTimer finishes (even with a job error), state.running must be + // false — proving the finally block executed. + expect(state.running).toBe(false); + + // The timer must be re-armed. + expect(state.timer).not.toBeNull(); + }); + + it("session reaper runs when resolveSessionStorePath is provided", async () => { + const store = await makeStorePath(); + const now = Date.parse("2026-02-10T10:00:00.000Z"); + + await fs.mkdir(path.dirname(store.storePath), { recursive: true }); + await fs.writeFile( + store.storePath, + JSON.stringify({ + version: 1, + jobs: [createDueIsolatedJob({ id: "ok-job", nowMs: now })], + }), + "utf-8", + ); + + const resolvedPaths: string[] = []; + const state = createCronServiceState({ + storePath: store.storePath, + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok", summary: "done" }), + resolveSessionStorePath: (agentId) => { + const p = path.join(path.dirname(store.storePath), `${agentId}-sessions`, "sessions.json"); + resolvedPaths.push(p); + return p; + }, + }); + + await onTimer(state); + + // The resolveSessionStorePath callback should have been invoked to build + // the set of store paths for the session reaper. + expect(resolvedPaths.length).toBeGreaterThan(0); + expect(state.running).toBe(false); + }); + + it("prunes expired cron-run sessions even when cron store load throws", async () => { + const store = await makeStorePath(); + const now = Date.parse("2026-02-10T10:00:00.000Z"); + const sessionStorePath = path.join(path.dirname(store.storePath), "sessions", "sessions.json"); + + // Force onTimer's try-block to throw before normal execution flow. + await fs.mkdir(path.dirname(store.storePath), { recursive: true }); + await fs.writeFile(store.storePath, "{invalid-json", "utf-8"); + + // Seed an expired cron-run session entry that should be pruned by the reaper. + await fs.mkdir(path.dirname(sessionStorePath), { recursive: true }); + await fs.writeFile( + sessionStorePath, + JSON.stringify({ + "agent:agent-default:cron:failing-job:run:stale": { + sessionId: "session-stale", + updatedAt: now - 3 * 24 * 3_600_000, + }, + }), + "utf-8", + ); + + const state = createCronServiceState({ + storePath: store.storePath, + cronEnabled: true, + log: noopLogger, + nowMs: () => now, + enqueueSystemEvent: vi.fn(), + requestHeartbeatNow: vi.fn(), + runIsolatedAgentJob: vi.fn(), + sessionStorePath, + }); + + await expect(onTimer(state)).rejects.toThrow("Failed to parse cron store"); + + const updatedSessionStore = JSON.parse(await fs.readFile(sessionStorePath, "utf-8")) as Record< + string, + unknown + >; + expect(updatedSessionStore).toEqual({}); + expect(state.running).toBe(false); + }); +}); diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 3190ccbb45b..9d5e2e8becb 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -643,7 +643,11 @@ export async function onTimer(state: CronServiceState) { await persist(state); }); } + } finally { // Piggyback session reaper on timer tick (self-throttled to every 5 min). + // Placed in `finally` so the reaper runs even when a long-running job keeps + // `state.running` true across multiple timer ticks — the early return at the + // top of onTimer would otherwise skip the reaper indefinitely. const storePaths = new Set(); if (state.deps.resolveSessionStorePath) { const defaultAgentId = state.deps.defaultAgentId ?? DEFAULT_AGENT_ID; @@ -675,7 +679,7 @@ export async function onTimer(state: CronServiceState) { } } } - } finally { + state.running = false; armTimer(state); }