feat(diagnostics): add configurable stuck-session warning threshold

This commit is contained in:
Peter Steinberger
2026-03-02 00:07:02 +00:00
parent d729ab2150
commit 41cc46bbb4
8 changed files with 95 additions and 4 deletions

View File

@@ -418,6 +418,8 @@ export const FIELD_HELP: Record<string, string> = {
'Enable targeted diagnostics logs by flag (e.g. ["telegram.http"]). Supports wildcards like "telegram.*" or "*".', 'Enable targeted diagnostics logs by flag (e.g. ["telegram.http"]). Supports wildcards like "telegram.*" or "*".',
"diagnostics.enabled": "diagnostics.enabled":
"Master toggle for diagnostics instrumentation output in logs and telemetry wiring paths. Keep enabled for normal observability, and disable only in tightly constrained environments.", "Master toggle for diagnostics instrumentation output in logs and telemetry wiring paths. Keep enabled for normal observability, and disable only in tightly constrained environments.",
"diagnostics.stuckSessionWarnMs":
"Age threshold in milliseconds for emitting stuck-session warnings while a session remains in processing state. Increase for long multi-tool turns to reduce false positives; decrease for faster hang detection.",
"diagnostics.otel.enabled": "diagnostics.otel.enabled":
"Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.", "Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.",
"diagnostics.otel.endpoint": "diagnostics.otel.endpoint":
@@ -945,6 +947,8 @@ export const FIELD_HELP: Record<string, string> = {
"Enables pre-compaction memory flush before the runtime performs stronger history reduction near token limits. Keep enabled unless you intentionally disable memory side effects in constrained environments.", "Enables pre-compaction memory flush before the runtime performs stronger history reduction near token limits. Keep enabled unless you intentionally disable memory side effects in constrained environments.",
"agents.defaults.compaction.memoryFlush.softThresholdTokens": "agents.defaults.compaction.memoryFlush.softThresholdTokens":
"Threshold distance to compaction (in tokens) that triggers pre-compaction memory flush execution. Use earlier thresholds for safer persistence, or tighter thresholds for lower flush frequency.", "Threshold distance to compaction (in tokens) that triggers pre-compaction memory flush execution. Use earlier thresholds for safer persistence, or tighter thresholds for lower flush frequency.",
"agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes":
'Forces pre-compaction memory flush when transcript file size reaches this threshold (bytes or strings like "2mb"). Use this to prevent long-session hangs even when token counters are stale; set to 0 to disable.',
"agents.defaults.compaction.memoryFlush.prompt": "agents.defaults.compaction.memoryFlush.prompt":
"User-prompt template used for the pre-compaction memory flush turn when generating memory candidates. Use this only when you need custom extraction instructions beyond the default memory flush behavior.", "User-prompt template used for the pre-compaction memory flush turn when generating memory candidates. Use this only when you need custom extraction instructions beyond the default memory flush behavior.",
"agents.defaults.compaction.memoryFlush.systemPrompt": "agents.defaults.compaction.memoryFlush.systemPrompt":

View File

@@ -34,6 +34,7 @@ export const FIELD_LABELS: Record<string, string> = {
"update.auto.betaCheckIntervalHours": "Auto Update Beta Check Interval (hours)", "update.auto.betaCheckIntervalHours": "Auto Update Beta Check Interval (hours)",
"diagnostics.enabled": "Diagnostics Enabled", "diagnostics.enabled": "Diagnostics Enabled",
"diagnostics.flags": "Diagnostics Flags", "diagnostics.flags": "Diagnostics Flags",
"diagnostics.stuckSessionWarnMs": "Stuck Session Warning Threshold (ms)",
"diagnostics.otel.enabled": "OpenTelemetry Enabled", "diagnostics.otel.enabled": "OpenTelemetry Enabled",
"diagnostics.otel.endpoint": "OpenTelemetry Endpoint", "diagnostics.otel.endpoint": "OpenTelemetry Endpoint",
"diagnostics.otel.protocol": "OpenTelemetry Protocol", "diagnostics.otel.protocol": "OpenTelemetry Protocol",
@@ -421,6 +422,8 @@ export const FIELD_LABELS: Record<string, string> = {
"agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled", "agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled",
"agents.defaults.compaction.memoryFlush.softThresholdTokens": "agents.defaults.compaction.memoryFlush.softThresholdTokens":
"Compaction Memory Flush Soft Threshold", "Compaction Memory Flush Soft Threshold",
"agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes":
"Compaction Memory Flush Transcript Size Threshold",
"agents.defaults.compaction.memoryFlush.prompt": "Compaction Memory Flush Prompt", "agents.defaults.compaction.memoryFlush.prompt": "Compaction Memory Flush Prompt",
"agents.defaults.compaction.memoryFlush.systemPrompt": "Compaction Memory Flush System Prompt", "agents.defaults.compaction.memoryFlush.systemPrompt": "Compaction Memory Flush System Prompt",
"agents.defaults.embeddedPi": "Embedded Pi", "agents.defaults.embeddedPi": "Embedded Pi",

View File

@@ -205,6 +205,8 @@ export type DiagnosticsConfig = {
enabled?: boolean; enabled?: boolean;
/** Optional ad-hoc diagnostics flags (e.g. "telegram.http"). */ /** Optional ad-hoc diagnostics flags (e.g. "telegram.http"). */
flags?: string[]; flags?: string[];
/** Threshold in ms before a processing session logs "stuck session" diagnostics. */
stuckSessionWarnMs?: number;
otel?: DiagnosticsOtelConfig; otel?: DiagnosticsOtelConfig;
cacheTrace?: DiagnosticsCacheTraceConfig; cacheTrace?: DiagnosticsCacheTraceConfig;
}; };

View File

@@ -179,6 +179,7 @@ export const OpenClawSchema = z
.object({ .object({
enabled: z.boolean().optional(), enabled: z.boolean().optional(),
flags: z.array(z.string()).optional(), flags: z.array(z.string()).optional(),
stuckSessionWarnMs: z.number().int().positive().optional(),
otel: z otel: z
.object({ .object({
enabled: z.boolean().optional(), enabled: z.boolean().optional(),

View File

@@ -371,7 +371,7 @@ export async function startGatewayServer(
).config; ).config;
const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart); const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart);
if (diagnosticsEnabled) { if (diagnosticsEnabled) {
startDiagnosticHeartbeat(); startDiagnosticHeartbeat(cfgAtStart);
} }
setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) }); setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) });
setPreRestartDeferralCheck( setPreRestartDeferralCheck(

View File

@@ -1,5 +1,6 @@
import fs from "node:fs"; import fs from "node:fs";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { onDiagnosticEvent, resetDiagnosticEventsForTest } from "../infra/diagnostic-events.js";
import { import {
diagnosticSessionStates, diagnosticSessionStates,
getDiagnosticSessionStateCountForTest, getDiagnosticSessionStateCountForTest,
@@ -7,6 +8,12 @@ import {
pruneDiagnosticSessionStates, pruneDiagnosticSessionStates,
resetDiagnosticSessionStateForTest, resetDiagnosticSessionStateForTest,
} from "./diagnostic-session-state.js"; } from "./diagnostic-session-state.js";
import {
logSessionStateChange,
resetDiagnosticStateForTest,
resolveStuckSessionWarnMs,
startDiagnosticHeartbeat,
} from "./diagnostic.js";
describe("diagnostic session state pruning", () => { describe("diagnostic session state pruning", () => {
beforeEach(() => { beforeEach(() => {
@@ -74,3 +81,60 @@ describe("logger import side effects", () => {
expect(mkdirSpy).not.toHaveBeenCalled(); expect(mkdirSpy).not.toHaveBeenCalled();
}); });
}); });
describe("stuck session diagnostics threshold", () => {
  const STUCK_EVENT_TYPE = "session.stuck";

  /**
   * Subscribes to diagnostic events, runs `scenario`, always unsubscribes
   * (even when the scenario throws), and returns only the recorded events
   * whose type is "session.stuck".
   */
  function collectStuckEvents(scenario: () => void): Array<{ type: string }> {
    const recorded: Array<{ type: string }> = [];
    const stopListening = onDiagnosticEvent((event) => {
      recorded.push({ type: event.type });
    });
    try {
      scenario();
    } finally {
      stopListening();
    }
    return recorded.filter(({ type }) => type === STUCK_EVENT_TYPE);
  }

  beforeEach(() => {
    vi.useFakeTimers();
    resetDiagnosticStateForTest();
    resetDiagnosticEventsForTest();
  });

  afterEach(() => {
    resetDiagnosticEventsForTest();
    resetDiagnosticStateForTest();
    vi.useRealTimers();
  });

  it("uses the configured diagnostics.stuckSessionWarnMs threshold", () => {
    // NOTE(review): expecting exactly one event after 61s presumably relies on
    // the heartbeat tick period, which is not visible here — confirm against
    // startDiagnosticHeartbeat.
    const stuckEvents = collectStuckEvents(() => {
      startDiagnosticHeartbeat({
        diagnostics: {
          enabled: true,
          stuckSessionWarnMs: 30_000,
        },
      });
      logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
      vi.advanceTimersByTime(61_000);
    });

    expect(stuckEvents).toHaveLength(1);
  });

  it("falls back to default threshold when config is absent", () => {
    // 31s is well short of the 120s default, so no warning may fire.
    const stuckEvents = collectStuckEvents(() => {
      startDiagnosticHeartbeat();
      logSessionStateChange({ sessionId: "s2", sessionKey: "main", state: "processing" });
      vi.advanceTimersByTime(31_000);
    });

    expect(stuckEvents).toHaveLength(0);
  });

  it("uses default threshold for invalid values", () => {
    for (const invalidMs of [-1, 0]) {
      expect(resolveStuckSessionWarnMs({ diagnostics: { stuckSessionWarnMs: invalidMs } })).toBe(
        120_000,
      );
    }
    expect(resolveStuckSessionWarnMs()).toBe(120_000);
  });
});

View File

@@ -1,3 +1,4 @@
import type { OpenClawConfig } from "../config/config.js";
import { emitDiagnosticEvent } from "../infra/diagnostic-events.js"; import { emitDiagnosticEvent } from "../infra/diagnostic-events.js";
import { import {
diagnosticSessionStates, diagnosticSessionStates,
@@ -20,11 +21,26 @@ const webhookStats = {
}; };
let lastActivityAt = 0; let lastActivityAt = 0;
/** Threshold (ms) used when no usable `diagnostics.stuckSessionWarnMs` is configured: 120 seconds. */
const DEFAULT_STUCK_SESSION_WARN_MS = 120_000;
/** Smallest configured threshold (ms) that is accepted; lower values resolve to the default. */
const MIN_STUCK_SESSION_WARN_MS = 1_000;
/** Largest configured threshold (ms) that is accepted (24 hours); higher values resolve to the default. */
const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000;
function markActivity() { function markActivity() {
lastActivityAt = Date.now(); lastActivityAt = Date.now();
} }
/**
 * Resolves the stuck-session warning threshold, in milliseconds, from config.
 *
 * The configured `diagnostics.stuckSessionWarnMs` is floored to a whole number
 * and used only when it lies within
 * [MIN_STUCK_SESSION_WARN_MS, MAX_STUCK_SESSION_WARN_MS] (inclusive).
 * A missing, non-numeric, non-finite, or out-of-range value is not clamped;
 * it resolves to DEFAULT_STUCK_SESSION_WARN_MS.
 *
 * @param config - Optional application config; may be omitted entirely.
 * @returns The threshold in whole milliseconds.
 */
export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number {
  const configured = config?.diagnostics?.stuckSessionWarnMs;

  // NaN marks "no usable value"; it fails both range comparisons below,
  // so unusable input falls through to the default.
  const wholeMs =
    typeof configured === "number" && Number.isFinite(configured)
      ? Math.floor(configured)
      : Number.NaN;

  const withinBounds =
    wholeMs >= MIN_STUCK_SESSION_WARN_MS && wholeMs <= MAX_STUCK_SESSION_WARN_MS;

  return withinBounds ? wholeMs : DEFAULT_STUCK_SESSION_WARN_MS;
}
export function logWebhookReceived(params: { export function logWebhookReceived(params: {
channel: string; channel: string;
updateType?: string; updateType?: string;
@@ -305,10 +321,11 @@ export function logActiveRuns() {
let heartbeatInterval: NodeJS.Timeout | null = null; let heartbeatInterval: NodeJS.Timeout | null = null;
export function startDiagnosticHeartbeat() { export function startDiagnosticHeartbeat(config?: OpenClawConfig) {
if (heartbeatInterval) { if (heartbeatInterval) {
return; return;
} }
const stuckSessionWarnMs = resolveStuckSessionWarnMs(config);
heartbeatInterval = setInterval(() => { heartbeatInterval = setInterval(() => {
const now = Date.now(); const now = Date.now();
pruneDiagnosticSessionStates(now, true); pruneDiagnosticSessionStates(now, true);
@@ -362,7 +379,7 @@ export function startDiagnosticHeartbeat() {
for (const [, state] of diagnosticSessionStates) { for (const [, state] of diagnosticSessionStates) {
const ageMs = now - state.lastActivity; const ageMs = now - state.lastActivity;
if (state.state === "processing" && ageMs > 120_000) { if (state.state === "processing" && ageMs > stuckSessionWarnMs) {
logSessionStuck({ logSessionStuck({
sessionId: state.sessionId, sessionId: state.sessionId,
sessionKey: state.sessionKey, sessionKey: state.sessionKey,

View File

@@ -120,7 +120,7 @@ export async function startTelegramWebhook(opts: {
}); });
if (diagnosticsEnabled) { if (diagnosticsEnabled) {
startDiagnosticHeartbeat(); startDiagnosticHeartbeat(opts.config);
} }
const server = createServer((req, res) => { const server = createServer((req, res) => {