mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 22:54:33 +00:00
fix: unify session maintenance and cron run pruning (#13083)
* fix: prune stale session entries, cap entry count, and rotate sessions.json
The sessions.json file grows unbounded over time. Every heartbeat tick (default: 30m)
triggers multiple full rewrites, and session keys from groups, threads, and DMs
accumulate indefinitely with large embedded objects (skillsSnapshot,
systemPromptReport). At >50MB the synchronous JSON parse blocks the event loop,
causing Telegram webhook timeouts and effectively taking the bot down.
Three mitigations, all running inside saveSessionStoreUnlocked() on every write:
1. Prune stale entries: remove entries with updatedAt older than 30 days
(configurable via session.maintenance.pruneDays in openclaw.json)
2. Cap entry count: keep only the 500 most recently updated entries
(configurable via session.maintenance.maxEntries). Entries without updatedAt
are evicted first.
3. File rotation: if the existing sessions.json exceeds 10MB before a write
(threshold configurable via session.maintenance.rotateBytes), rename it to
sessions.json.bak.{timestamp} and keep only the 3 most recent backups.
All three thresholds are configurable under session.maintenance in openclaw.json
with Zod validation. No env vars.
Existing tests updated to use Date.now() instead of epoch-relative timestamps
(1, 2, 3) that would be incorrectly pruned as stale.
27 new tests covering pruning, capping, rotation, and integration scenarios.
* feat: auto-prune expired cron run sessions (#12289)
Add TTL-based reaper for isolated cron run sessions that accumulate
indefinitely in sessions.json.
New config option:
cron.sessionRetention: string | false (default: '24h')
The reaper runs piggy-backed on the cron timer tick, self-throttled
to sweep at most every 5 minutes. It removes session entries matching
the pattern cron:<jobId>:run:<uuid> whose updatedAt + retention < now.
Design follows the Kubernetes ttlSecondsAfterFinished pattern:
- Sessions are persisted normally (observability/debugging)
- A periodic reaper prunes expired entries
- Configurable retention with sensible default
- Set to false to disable pruning entirely
Files changed:
- src/config/types.cron.ts: Add sessionRetention to CronConfig
- src/config/zod-schema.ts: Add Zod validation for sessionRetention
- src/cron/session-reaper.ts: New reaper module (sweepCronRunSessions)
- src/cron/session-reaper.test.ts: 12 tests covering all paths
- src/cron/service/state.ts: Add cronConfig/sessionStorePath to deps
- src/cron/service/timer.ts: Wire reaper into onTimer tick
- src/gateway/server-cron.ts: Pass config and session store path to deps
Closes #12289
* fix: sweep cron session stores per agent
* docs: add changelog for session maintenance (#13083) (thanks @skyfallsin, @Glucksberg)
* fix: add warn-only session maintenance mode
* fix: warn-only maintenance defaults to active session
* fix: deliver maintenance warnings to active session
* docs: add session maintenance examples
* fix: accept duration and size maintenance thresholds
* refactor: share cron run session key check
* fix: format issues and replace defaultRuntime.warn with console.warn
---------
Co-authored-by: Pradeep Elankumaran <pradeepe@gmail.com>
Co-authored-by: Glucksberg <markuscontasul@gmail.com>
Co-authored-by: max <40643627+quotentiroler@users.noreply.github.com>
Co-authored-by: quotentiroler <max.nussbaumer@maxhealth.tech>
This commit is contained in:
committed by
GitHub
parent
0657d7c772
commit
e19a23520c
@@ -3,6 +3,10 @@ import crypto from "node:crypto";
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import type { MsgContext } from "../../auto-reply/templating.js";
|
||||
import type { SessionMaintenanceConfig, SessionMaintenanceMode } from "../types.base.js";
|
||||
import { parseByteSize } from "../../cli/parse-bytes.js";
|
||||
import { parseDurationMs } from "../../cli/parse-duration.js";
|
||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||
import {
|
||||
deliveryContextFromSession,
|
||||
mergeDeliveryContext,
|
||||
@@ -11,9 +15,12 @@ import {
|
||||
type DeliveryContext,
|
||||
} from "../../utils/delivery-context.js";
|
||||
import { getFileMtimeMs, isCacheEnabled, resolveCacheTtlMs } from "../cache-utils.js";
|
||||
import { loadConfig } from "../config.js";
|
||||
import { deriveSessionMetaPatch } from "./metadata.js";
|
||||
import { mergeSessionEntry, type SessionEntry } from "./types.js";
|
||||
|
||||
const log = createSubsystemLogger("sessions/store");
|
||||
|
||||
// ============================================================================
|
||||
// Session Store Cache with TTL Support
|
||||
// ============================================================================
|
||||
@@ -195,15 +202,300 @@ export function readSessionUpdatedAt(params: {
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Session Store Pruning, Capping & File Rotation
|
||||
// ============================================================================
|
||||
|
||||
/** Fallback stale-entry age: 30 days, expressed in milliseconds. */
const DEFAULT_SESSION_PRUNE_AFTER_MS = 30 * 86_400_000;

/** Fallback upper bound on the number of entries kept in the store. */
const DEFAULT_SESSION_MAX_ENTRIES = 500;

/** Fallback on-disk size above which the store file is rotated: 10 MiB. */
const DEFAULT_SESSION_ROTATE_BYTES = 10 * 1024 * 1024;

/** Fallback maintenance mode used when the config does not set one. */
const DEFAULT_SESSION_MAINTENANCE_MODE: SessionMaintenanceMode = "warn";
|
||||
|
||||
/**
 * Report produced when maintenance would evict the currently active session.
 */
export type SessionMaintenanceWarning = {
  /** Trimmed key of the active session the warning is about. */
  activeSessionKey: string;
  /** `updatedAt` of the active entry, when the entry carries one. */
  activeUpdatedAt?: number;
  /** Number of entries in the store at the time of the check. */
  totalEntries: number;
  /** Stale-age threshold (ms) the check was evaluated against. */
  pruneAfterMs: number;
  /** Entry-count limit the check was evaluated against. */
  maxEntries: number;
  /** True when the active entry is older than the stale-age threshold. */
  wouldPrune: boolean;
  /** True when the active entry falls outside the `maxEntries` most recent entries. */
  wouldCap: boolean;
};
|
||||
|
||||
/**
 * Maintenance settings after defaults have been applied: every field is set.
 */
type ResolvedSessionMaintenanceConfig = {
  /** Maintenance mode; `"warn"` reports instead of enforcing. */
  mode: SessionMaintenanceMode;
  /** Age in milliseconds after which an entry counts as stale. */
  pruneAfterMs: number;
  /** Maximum number of entries to keep in the store. */
  maxEntries: number;
  /** File size in bytes above which the store file is rotated. */
  rotateBytes: number;
};
|
||||
|
||||
/**
 * Resolve the stale-entry age threshold in milliseconds.
 *
 * Reads `pruneAfter`, falling back to `pruneDays`. The value is stringified,
 * trimmed, and parsed as a duration whose default unit is days. A missing,
 * empty, or unparseable value yields the built-in default.
 */
function resolvePruneAfterMs(maintenance?: SessionMaintenanceConfig): number {
  const configured = maintenance?.pruneAfter ?? maintenance?.pruneDays;
  const isUnset = configured === undefined || configured === null || configured === "";
  if (!isUnset) {
    try {
      return parseDurationMs(String(configured).trim(), { defaultUnit: "d" });
    } catch {
      // Unparseable value: fall through to the default below.
    }
  }
  return DEFAULT_SESSION_PRUNE_AFTER_MS;
}
|
||||
|
||||
/**
 * Resolve the file-rotation size threshold in bytes.
 *
 * `rotateBytes` is stringified, trimmed, and parsed as a byte size whose
 * default unit is bytes. A missing, empty, or unparseable value yields the
 * built-in default.
 */
function resolveRotateBytes(maintenance?: SessionMaintenanceConfig): number {
  const configured = maintenance?.rotateBytes;
  const isUnset = configured === undefined || configured === null || configured === "";
  if (!isUnset) {
    try {
      return parseByteSize(String(configured).trim(), { defaultUnit: "b" });
    } catch {
      // Unparseable value: fall through to the default below.
    }
  }
  return DEFAULT_SESSION_ROTATE_BYTES;
}
|
||||
|
||||
/**
 * Build the effective session maintenance settings.
 *
 * Reads `session.maintenance` from the loaded config (openclaw.json) and fills
 * every unset field with its built-in default. If loading the config throws,
 * all fields fall back to defaults.
 */
export function resolveMaintenanceConfig(): ResolvedSessionMaintenanceConfig {
  let maintenance: SessionMaintenanceConfig | undefined;
  try {
    maintenance = loadConfig().session?.maintenance;
  } catch {
    // Config may not be available (e.g. in tests). Use defaults.
    maintenance = undefined;
  }

  const mode = maintenance?.mode ?? DEFAULT_SESSION_MAINTENANCE_MODE;
  const pruneAfterMs = resolvePruneAfterMs(maintenance);
  const maxEntries = maintenance?.maxEntries ?? DEFAULT_SESSION_MAX_ENTRIES;
  const rotateBytes = resolveRotateBytes(maintenance);
  return { mode, pruneAfterMs, maxEntries, rotateBytes };
}
|
||||
|
||||
/**
 * Delete every entry whose `updatedAt` lies further in the past than the
 * maximum age. Entries with no `updatedAt` are left alone because their age
 * cannot be determined. The store object is modified in place.
 *
 * @param store Session entries keyed by session key.
 * @param overrideMaxAgeMs Maximum age in ms; when omitted, the resolved
 *   maintenance config supplies it.
 * @param opts Pass `log: false` to suppress the info log line.
 * @returns The number of entries removed.
 */
export function pruneStaleEntries(
  store: Record<string, SessionEntry>,
  overrideMaxAgeMs?: number,
  opts: { log?: boolean } = {},
): number {
  const maxAgeMs = overrideMaxAgeMs ?? resolveMaintenanceConfig().pruneAfterMs;
  const cutoffMs = Date.now() - maxAgeMs;

  const staleKeys = Object.keys(store).filter((key) => {
    const updatedAt = store[key]?.updatedAt;
    return updatedAt != null && updatedAt < cutoffMs;
  });
  for (const key of staleKeys) {
    delete store[key];
  }

  const pruned = staleKeys.length;
  const shouldLog = opts.log !== false;
  if (shouldLog && pruned > 0) {
    log.info("pruned stale session entries", { pruned, maxAgeMs });
  }
  return pruned;
}
|
||||
|
||||
/**
 * Sort key for ordering entries by recency.
 *
 * Returns the entry's `updatedAt`, or negative infinity when the entry or its
 * timestamp is missing. In a newest-first ordering such entries are therefore
 * sorted last, which means they are removed first when the store is over its
 * entry limit.
 */
function getEntryUpdatedAt(entry?: SessionEntry): number {
  const stamp = entry?.updatedAt;
  return stamp == null ? Number.NEGATIVE_INFINITY : stamp;
}
|
||||
|
||||
/**
 * Check whether maintenance would evict the active session, without changing
 * the store.
 *
 * @param params.store Session entries keyed by session key.
 * @param params.activeSessionKey Key of the session to check (trimmed first).
 * @param params.pruneAfterMs Stale-age threshold in milliseconds.
 * @param params.maxEntries Entry-count limit.
 * @param params.nowMs Clock override; defaults to `Date.now()`.
 * @returns A warning when the active entry would be pruned as stale or would
 *   fall outside the `maxEntries` most recently updated entries; `null` when
 *   the key is blank, the entry does not exist, or neither condition holds.
 */
export function getActiveSessionMaintenanceWarning(params: {
  store: Record<string, SessionEntry>;
  activeSessionKey: string;
  pruneAfterMs: number;
  maxEntries: number;
  nowMs?: number;
}): SessionMaintenanceWarning | null {
  const { store, pruneAfterMs, maxEntries } = params;
  const activeSessionKey = params.activeSessionKey.trim();
  const activeEntry = activeSessionKey ? store[activeSessionKey] : undefined;
  if (!activeEntry) {
    return null;
  }

  const staleBefore = (params.nowMs ?? Date.now()) - pruneAfterMs;
  const wouldPrune = activeEntry.updatedAt != null && activeEntry.updatedAt < staleBefore;

  const allKeys = Object.keys(store);
  let wouldCap = false;
  if (allKeys.length > maxEntries) {
    // Newest first; everything past the first `maxEntries` keys would be dropped.
    const newestFirst = allKeys.toSorted(
      (a, b) => getEntryUpdatedAt(store[b]) - getEntryUpdatedAt(store[a]),
    );
    wouldCap = newestFirst.slice(maxEntries).includes(activeSessionKey);
  }

  if (!(wouldPrune || wouldCap)) {
    return null;
  }

  return {
    activeSessionKey,
    activeUpdatedAt: activeEntry.updatedAt,
    totalEntries: allKeys.length,
    pruneAfterMs,
    maxEntries,
    wouldPrune,
    wouldCap,
  };
}
|
||||
|
||||
/**
 * Trim the store down to the most recently updated entries.
 *
 * Entries are ranked newest-first by `updatedAt`; entries without a timestamp
 * rank last and are therefore dropped first. The store object is modified in
 * place.
 *
 * @param store Session entries keyed by session key.
 * @param overrideMax Entry limit; when omitted, the resolved maintenance
 *   config supplies it.
 * @param opts Pass `log: false` to suppress the info log line.
 * @returns The number of entries removed (0 when the store is within the limit).
 */
export function capEntryCount(
  store: Record<string, SessionEntry>,
  overrideMax?: number,
  opts: { log?: boolean } = {},
): number {
  const maxEntries = overrideMax ?? resolveMaintenanceConfig().maxEntries;
  const allKeys = Object.keys(store);
  if (allKeys.length <= maxEntries) {
    return 0;
  }

  const newestFirst = allKeys.toSorted(
    (a, b) => getEntryUpdatedAt(store[b]) - getEntryUpdatedAt(store[a]),
  );
  const evicted = newestFirst.slice(maxEntries);
  evicted.forEach((key) => {
    delete store[key];
  });

  const shouldLog = opts.log !== false;
  if (shouldLog) {
    log.info("capped session entry count", { removed: evicted.length, maxEntries });
  }
  return evicted.length;
}
|
||||
|
||||
/**
 * Size of the file at `storePath` in bytes, or `null` when `stat` fails
 * (for example because the file does not exist yet).
 */
async function getSessionFileSize(storePath: string): Promise<number | null> {
  let sizeBytes: number | null;
  try {
    const { size } = await fs.promises.stat(storePath);
    sizeBytes = size;
  } catch {
    sizeBytes = null;
  }
  return sizeBytes;
}
|
||||
|
||||
/**
 * Rotate the sessions file if it exceeds the configured size threshold.
 *
 * The current file is renamed to `<storePath>.bak.<timestamp>`, then older
 * rotation backups are deleted so that only the 3 most recent remain.
 *
 * Only files whose suffix after `.bak.` is purely numeric count as rotation
 * backups, and they are ordered by that number. A plain string sort would rank
 * `.bak.999` above `.bak.1000`, and would let a hand-made file such as
 * `.bak.manual` either take a "most recent" slot or be deleted.
 *
 * @param storePath Path of the sessions file.
 * @param overrideBytes Size threshold in bytes; when omitted, the resolved
 *   maintenance config supplies it.
 * @returns `true` when the file was rotated; `false` when it is missing, within
 *   the threshold, or the rename failed.
 */
export async function rotateSessionFile(
  storePath: string,
  overrideBytes?: number,
): Promise<boolean> {
  const maxBytes = overrideBytes ?? resolveMaintenanceConfig().rotateBytes;

  // The file may not exist yet; a null size means there is nothing to rotate.
  const fileSize = await getSessionFileSize(storePath);
  if (fileSize == null || fileSize <= maxBytes) {
    return false;
  }

  // Rotate: rename current file to .bak.{timestamp}
  const backupPath = `${storePath}.bak.${Date.now()}`;
  try {
    await fs.promises.rename(storePath, backupPath);
    log.info("rotated session store file", {
      backupPath: path.basename(backupPath),
      sizeBytes: fileSize,
    });
  } catch {
    // If rename fails (e.g. file disappeared), skip rotation.
    return false;
  }

  // Clean up old backups, keeping only the most recent ones.
  try {
    const maxBackups = 3;
    const dir = path.dirname(storePath);
    const backupPrefix = `${path.basename(storePath)}.bak.`;
    const numericSuffix = /^\d+$/;
    const files = await fs.promises.readdir(dir);

    const backups = files
      .filter((name) => name.startsWith(backupPrefix))
      .map((name) => ({ name, suffix: name.slice(backupPrefix.length) }))
      .filter(({ suffix }) => numericSuffix.test(suffix))
      .map(({ name, suffix }) => ({ name, stamp: Number(suffix) }))
      .toSorted((a, b) => b.stamp - a.stamp);

    const toDelete = backups.slice(maxBackups);
    if (toDelete.length > 0) {
      for (const old of toDelete) {
        // A backup that cannot be removed is ignored; the rotation still succeeded.
        await fs.promises.unlink(path.join(dir, old.name)).catch(() => undefined);
      }
      log.info("cleaned up old session store backups", { deleted: toDelete.length });
    }
  } catch {
    // Best-effort cleanup; don't fail the write.
  }

  return true;
}
|
||||
|
||||
/**
 * Options accepted by the session store save helpers.
 */
type SaveSessionStoreOptions = {
  /** When true, the save skips pruning, capping, and rotation (e.g. during one-time migrations). */
  skipMaintenance?: boolean;
  /** Key of the session being written; warn-only maintenance checks this entry. */
  activeSessionKey?: string;
  /** Invoked (and awaited) with the warning when warn-only maintenance reports one. */
  onWarn?: (warning: SessionMaintenanceWarning) => void | Promise<void>;
};
|
||||
|
||||
async function saveSessionStoreUnlocked(
|
||||
storePath: string,
|
||||
store: Record<string, SessionEntry>,
|
||||
opts?: SaveSessionStoreOptions,
|
||||
): Promise<void> {
|
||||
// Invalidate cache on write to ensure consistency
|
||||
invalidateSessionStoreCache(storePath);
|
||||
|
||||
normalizeSessionStore(store);
|
||||
|
||||
if (!opts?.skipMaintenance) {
|
||||
// Resolve maintenance config once (avoids repeated loadConfig() calls).
|
||||
const maintenance = resolveMaintenanceConfig();
|
||||
const shouldWarnOnly = maintenance.mode === "warn";
|
||||
|
||||
if (shouldWarnOnly) {
|
||||
const activeSessionKey = opts?.activeSessionKey?.trim();
|
||||
if (activeSessionKey) {
|
||||
const warning = getActiveSessionMaintenanceWarning({
|
||||
store,
|
||||
activeSessionKey,
|
||||
pruneAfterMs: maintenance.pruneAfterMs,
|
||||
maxEntries: maintenance.maxEntries,
|
||||
});
|
||||
if (warning) {
|
||||
log.warn("session maintenance would evict active session; skipping enforcement", {
|
||||
activeSessionKey: warning.activeSessionKey,
|
||||
wouldPrune: warning.wouldPrune,
|
||||
wouldCap: warning.wouldCap,
|
||||
pruneAfterMs: warning.pruneAfterMs,
|
||||
maxEntries: warning.maxEntries,
|
||||
});
|
||||
await opts?.onWarn?.(warning);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Prune stale entries and cap total count before serializing.
|
||||
pruneStaleEntries(store, maintenance.pruneAfterMs);
|
||||
capEntryCount(store, maintenance.maxEntries);
|
||||
|
||||
// Rotate the on-disk file if it exceeds the size threshold.
|
||||
await rotateSessionFile(storePath, maintenance.rotateBytes);
|
||||
}
|
||||
}
|
||||
|
||||
await fs.promises.mkdir(path.dirname(storePath), { recursive: true });
|
||||
const json = JSON.stringify(store, null, 2);
|
||||
|
||||
@@ -266,21 +558,23 @@ async function saveSessionStoreUnlocked(
|
||||
/**
 * Write `store` to `storePath` while holding the session store lock.
 *
 * The store is written exactly once, and `opts` is forwarded so that
 * maintenance options (`skipMaintenance`, `activeSessionKey`, `onWarn`) apply
 * to that write.
 *
 * @param storePath Path of the sessions file.
 * @param store Session entries keyed by session key.
 * @param opts Optional save/maintenance options.
 */
export async function saveSessionStore(
  storePath: string,
  store: Record<string, SessionEntry>,
  opts?: SaveSessionStoreOptions,
): Promise<void> {
  await withSessionStoreLock(storePath, async () => {
    await saveSessionStoreUnlocked(storePath, store, opts);
  });
}
|
||||
|
||||
/**
 * Read, mutate, and save the session store under the session store lock.
 *
 * The store is loaded with the cache bypassed, passed to `mutator` (which may
 * modify it in place), and then written exactly once with `opts` forwarded so
 * that maintenance options apply to that write.
 *
 * @param storePath Path of the sessions file.
 * @param mutator Callback that receives the freshly loaded store; may be async.
 * @param opts Optional save/maintenance options.
 * @returns Whatever `mutator` returned.
 */
export async function updateSessionStore<T>(
  storePath: string,
  mutator: (store: Record<string, SessionEntry>) => Promise<T> | T,
  opts?: SaveSessionStoreOptions,
): Promise<T> {
  return await withSessionStoreLock(storePath, async () => {
    // Always re-read inside the lock to avoid clobbering concurrent writers.
    const store = loadSessionStore(storePath, { skipCache: true });
    const result = await mutator(store);
    await saveSessionStoreUnlocked(storePath, store, opts);
    return result;
  });
}
|
||||
@@ -381,7 +675,7 @@ export async function updateSessionStoreEntry(params: {
|
||||
}
|
||||
const next = mergeSessionEntry(existing, patch);
|
||||
store[sessionKey] = next;
|
||||
await saveSessionStoreUnlocked(storePath, store);
|
||||
await saveSessionStoreUnlocked(storePath, store, { activeSessionKey: sessionKey });
|
||||
return next;
|
||||
});
|
||||
}
|
||||
@@ -395,24 +689,28 @@ export async function recordSessionMetaFromInbound(params: {
|
||||
}): Promise<SessionEntry | null> {
|
||||
const { storePath, sessionKey, ctx } = params;
|
||||
const createIfMissing = params.createIfMissing ?? true;
|
||||
return await updateSessionStore(storePath, (store) => {
|
||||
const existing = store[sessionKey];
|
||||
const patch = deriveSessionMetaPatch({
|
||||
ctx,
|
||||
sessionKey,
|
||||
existing,
|
||||
groupResolution: params.groupResolution,
|
||||
});
|
||||
if (!patch) {
|
||||
return existing ?? null;
|
||||
}
|
||||
if (!existing && !createIfMissing) {
|
||||
return null;
|
||||
}
|
||||
const next = mergeSessionEntry(existing, patch);
|
||||
store[sessionKey] = next;
|
||||
return next;
|
||||
});
|
||||
return await updateSessionStore(
|
||||
storePath,
|
||||
(store) => {
|
||||
const existing = store[sessionKey];
|
||||
const patch = deriveSessionMetaPatch({
|
||||
ctx,
|
||||
sessionKey,
|
||||
existing,
|
||||
groupResolution: params.groupResolution,
|
||||
});
|
||||
if (!patch) {
|
||||
return existing ?? null;
|
||||
}
|
||||
if (!existing && !createIfMissing) {
|
||||
return null;
|
||||
}
|
||||
const next = mergeSessionEntry(existing, patch);
|
||||
store[sessionKey] = next;
|
||||
return next;
|
||||
},
|
||||
{ activeSessionKey: sessionKey },
|
||||
);
|
||||
}
|
||||
|
||||
export async function updateLastRoute(params: {
|
||||
@@ -488,7 +786,7 @@ export async function updateLastRoute(params: {
|
||||
metaPatch ? { ...basePatch, ...metaPatch } : basePatch,
|
||||
);
|
||||
store[sessionKey] = next;
|
||||
await saveSessionStoreUnlocked(storePath, store);
|
||||
await saveSessionStoreUnlocked(storePath, store, { activeSessionKey: sessionKey });
|
||||
return next;
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user