mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-19 06:57:26 +00:00
feat(diagnostics): add configurable stuck-session warning threshold
This commit is contained in:
@@ -418,6 +418,8 @@ export const FIELD_HELP: Record<string, string> = {
|
|||||||
'Enable targeted diagnostics logs by flag (e.g. ["telegram.http"]). Supports wildcards like "telegram.*" or "*".',
|
'Enable targeted diagnostics logs by flag (e.g. ["telegram.http"]). Supports wildcards like "telegram.*" or "*".',
|
||||||
"diagnostics.enabled":
|
"diagnostics.enabled":
|
||||||
"Master toggle for diagnostics instrumentation output in logs and telemetry wiring paths. Keep enabled for normal observability, and disable only in tightly constrained environments.",
|
"Master toggle for diagnostics instrumentation output in logs and telemetry wiring paths. Keep enabled for normal observability, and disable only in tightly constrained environments.",
|
||||||
|
"diagnostics.stuckSessionWarnMs":
|
||||||
|
"Age threshold in milliseconds for emitting stuck-session warnings while a session remains in processing state. Increase for long multi-tool turns to reduce false positives; decrease for faster hang detection.",
|
||||||
"diagnostics.otel.enabled":
|
"diagnostics.otel.enabled":
|
||||||
"Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.",
|
"Enables OpenTelemetry export pipeline for traces, metrics, and logs based on configured endpoint/protocol settings. Keep disabled unless your collector endpoint and auth are fully configured.",
|
||||||
"diagnostics.otel.endpoint":
|
"diagnostics.otel.endpoint":
|
||||||
@@ -945,6 +947,8 @@ export const FIELD_HELP: Record<string, string> = {
|
|||||||
"Enables pre-compaction memory flush before the runtime performs stronger history reduction near token limits. Keep enabled unless you intentionally disable memory side effects in constrained environments.",
|
"Enables pre-compaction memory flush before the runtime performs stronger history reduction near token limits. Keep enabled unless you intentionally disable memory side effects in constrained environments.",
|
||||||
"agents.defaults.compaction.memoryFlush.softThresholdTokens":
|
"agents.defaults.compaction.memoryFlush.softThresholdTokens":
|
||||||
"Threshold distance to compaction (in tokens) that triggers pre-compaction memory flush execution. Use earlier thresholds for safer persistence, or tighter thresholds for lower flush frequency.",
|
"Threshold distance to compaction (in tokens) that triggers pre-compaction memory flush execution. Use earlier thresholds for safer persistence, or tighter thresholds for lower flush frequency.",
|
||||||
|
"agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes":
|
||||||
|
'Forces pre-compaction memory flush when transcript file size reaches this threshold (bytes or strings like "2mb"). Use this to prevent long-session hangs even when token counters are stale; set to 0 to disable.',
|
||||||
"agents.defaults.compaction.memoryFlush.prompt":
|
"agents.defaults.compaction.memoryFlush.prompt":
|
||||||
"User-prompt template used for the pre-compaction memory flush turn when generating memory candidates. Use this only when you need custom extraction instructions beyond the default memory flush behavior.",
|
"User-prompt template used for the pre-compaction memory flush turn when generating memory candidates. Use this only when you need custom extraction instructions beyond the default memory flush behavior.",
|
||||||
"agents.defaults.compaction.memoryFlush.systemPrompt":
|
"agents.defaults.compaction.memoryFlush.systemPrompt":
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ export const FIELD_LABELS: Record<string, string> = {
|
|||||||
"update.auto.betaCheckIntervalHours": "Auto Update Beta Check Interval (hours)",
|
"update.auto.betaCheckIntervalHours": "Auto Update Beta Check Interval (hours)",
|
||||||
"diagnostics.enabled": "Diagnostics Enabled",
|
"diagnostics.enabled": "Diagnostics Enabled",
|
||||||
"diagnostics.flags": "Diagnostics Flags",
|
"diagnostics.flags": "Diagnostics Flags",
|
||||||
|
"diagnostics.stuckSessionWarnMs": "Stuck Session Warning Threshold (ms)",
|
||||||
"diagnostics.otel.enabled": "OpenTelemetry Enabled",
|
"diagnostics.otel.enabled": "OpenTelemetry Enabled",
|
||||||
"diagnostics.otel.endpoint": "OpenTelemetry Endpoint",
|
"diagnostics.otel.endpoint": "OpenTelemetry Endpoint",
|
||||||
"diagnostics.otel.protocol": "OpenTelemetry Protocol",
|
"diagnostics.otel.protocol": "OpenTelemetry Protocol",
|
||||||
@@ -421,6 +422,8 @@ export const FIELD_LABELS: Record<string, string> = {
|
|||||||
"agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled",
|
"agents.defaults.compaction.memoryFlush.enabled": "Compaction Memory Flush Enabled",
|
||||||
"agents.defaults.compaction.memoryFlush.softThresholdTokens":
|
"agents.defaults.compaction.memoryFlush.softThresholdTokens":
|
||||||
"Compaction Memory Flush Soft Threshold",
|
"Compaction Memory Flush Soft Threshold",
|
||||||
|
"agents.defaults.compaction.memoryFlush.forceFlushTranscriptBytes":
|
||||||
|
"Compaction Memory Flush Transcript Size Threshold",
|
||||||
"agents.defaults.compaction.memoryFlush.prompt": "Compaction Memory Flush Prompt",
|
"agents.defaults.compaction.memoryFlush.prompt": "Compaction Memory Flush Prompt",
|
||||||
"agents.defaults.compaction.memoryFlush.systemPrompt": "Compaction Memory Flush System Prompt",
|
"agents.defaults.compaction.memoryFlush.systemPrompt": "Compaction Memory Flush System Prompt",
|
||||||
"agents.defaults.embeddedPi": "Embedded Pi",
|
"agents.defaults.embeddedPi": "Embedded Pi",
|
||||||
|
|||||||
@@ -205,6 +205,8 @@ export type DiagnosticsConfig = {
|
|||||||
enabled?: boolean;
|
enabled?: boolean;
|
||||||
/** Optional ad-hoc diagnostics flags (e.g. "telegram.http"). */
|
/** Optional ad-hoc diagnostics flags (e.g. "telegram.http"). */
|
||||||
flags?: string[];
|
flags?: string[];
|
||||||
|
/** Threshold in ms before a processing session logs "stuck session" diagnostics. */
|
||||||
|
stuckSessionWarnMs?: number;
|
||||||
otel?: DiagnosticsOtelConfig;
|
otel?: DiagnosticsOtelConfig;
|
||||||
cacheTrace?: DiagnosticsCacheTraceConfig;
|
cacheTrace?: DiagnosticsCacheTraceConfig;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -179,6 +179,7 @@ export const OpenClawSchema = z
|
|||||||
.object({
|
.object({
|
||||||
enabled: z.boolean().optional(),
|
enabled: z.boolean().optional(),
|
||||||
flags: z.array(z.string()).optional(),
|
flags: z.array(z.string()).optional(),
|
||||||
|
stuckSessionWarnMs: z.number().int().positive().optional(),
|
||||||
otel: z
|
otel: z
|
||||||
.object({
|
.object({
|
||||||
enabled: z.boolean().optional(),
|
enabled: z.boolean().optional(),
|
||||||
|
|||||||
@@ -371,7 +371,7 @@ export async function startGatewayServer(
|
|||||||
).config;
|
).config;
|
||||||
const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart);
|
const diagnosticsEnabled = isDiagnosticsEnabled(cfgAtStart);
|
||||||
if (diagnosticsEnabled) {
|
if (diagnosticsEnabled) {
|
||||||
startDiagnosticHeartbeat();
|
startDiagnosticHeartbeat(cfgAtStart);
|
||||||
}
|
}
|
||||||
setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) });
|
setGatewaySigusr1RestartPolicy({ allowExternal: isRestartEnabled(cfgAtStart) });
|
||||||
setPreRestartDeferralCheck(
|
setPreRestartDeferralCheck(
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import fs from "node:fs";
|
import fs from "node:fs";
|
||||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
|
import { onDiagnosticEvent, resetDiagnosticEventsForTest } from "../infra/diagnostic-events.js";
|
||||||
import {
|
import {
|
||||||
diagnosticSessionStates,
|
diagnosticSessionStates,
|
||||||
getDiagnosticSessionStateCountForTest,
|
getDiagnosticSessionStateCountForTest,
|
||||||
@@ -7,6 +8,12 @@ import {
|
|||||||
pruneDiagnosticSessionStates,
|
pruneDiagnosticSessionStates,
|
||||||
resetDiagnosticSessionStateForTest,
|
resetDiagnosticSessionStateForTest,
|
||||||
} from "./diagnostic-session-state.js";
|
} from "./diagnostic-session-state.js";
|
||||||
|
import {
|
||||||
|
logSessionStateChange,
|
||||||
|
resetDiagnosticStateForTest,
|
||||||
|
resolveStuckSessionWarnMs,
|
||||||
|
startDiagnosticHeartbeat,
|
||||||
|
} from "./diagnostic.js";
|
||||||
|
|
||||||
describe("diagnostic session state pruning", () => {
|
describe("diagnostic session state pruning", () => {
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
@@ -74,3 +81,60 @@ describe("logger import side effects", () => {
|
|||||||
expect(mkdirSpy).not.toHaveBeenCalled();
|
expect(mkdirSpy).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("stuck session diagnostics threshold", () => {
  /**
   * Subscribes to diagnostic events, runs `scenario`, and returns only the
   * "session.stuck" events observed while it ran. The subscription is
   * released even when the scenario throws.
   */
  const captureStuckEvents = (scenario: () => void): Array<{ type: string }> => {
    const observed: Array<{ type: string }> = [];
    const stopListening = onDiagnosticEvent((event) => {
      observed.push({ type: event.type });
    });
    try {
      scenario();
    } finally {
      stopListening();
    }
    return observed.filter((entry) => entry.type === "session.stuck");
  };

  beforeEach(() => {
    // Fake timers let the tests advance the heartbeat's setInterval by hand.
    vi.useFakeTimers();
    resetDiagnosticStateForTest();
    resetDiagnosticEventsForTest();
  });

  afterEach(() => {
    resetDiagnosticEventsForTest();
    resetDiagnosticStateForTest();
    vi.useRealTimers();
  });

  it("uses the configured diagnostics.stuckSessionWarnMs threshold", () => {
    const stuckEvents = captureStuckEvents(() => {
      startDiagnosticHeartbeat({
        diagnostics: {
          enabled: true,
          stuckSessionWarnMs: 30_000,
        },
      });
      logSessionStateChange({ sessionId: "s1", sessionKey: "main", state: "processing" });
      // 61s of fake time is past the configured 30s threshold.
      // NOTE(review): exactly one event presumably relies on the heartbeat
      // period, which is not visible here — confirm.
      vi.advanceTimersByTime(61_000);
    });

    expect(stuckEvents).toHaveLength(1);
  });

  it("falls back to default threshold when config is absent", () => {
    const stuckEvents = captureStuckEvents(() => {
      startDiagnosticHeartbeat();
      logSessionStateChange({ sessionId: "s2", sessionKey: "main", state: "processing" });
      // 31s stays well under the 120s default, so nothing should be reported.
      vi.advanceTimersByTime(31_000);
    });

    expect(stuckEvents).toHaveLength(0);
  });

  it("uses default threshold for invalid values", () => {
    // Negative and zero thresholds are below the accepted minimum.
    for (const rejectedMs of [-1, 0]) {
      expect(
        resolveStuckSessionWarnMs({ diagnostics: { stuckSessionWarnMs: rejectedMs } }),
      ).toBe(120_000);
    }
    expect(resolveStuckSessionWarnMs()).toBe(120_000);
  });
});
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import type { OpenClawConfig } from "../config/config.js";
|
||||||
import { emitDiagnosticEvent } from "../infra/diagnostic-events.js";
|
import { emitDiagnosticEvent } from "../infra/diagnostic-events.js";
|
||||||
import {
|
import {
|
||||||
diagnosticSessionStates,
|
diagnosticSessionStates,
|
||||||
@@ -20,11 +21,26 @@ const webhookStats = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let lastActivityAt = 0;
|
let lastActivityAt = 0;
|
||||||
|
/** Threshold (ms) used when the configured stuck-session value is absent, non-finite, or out of range. */
const DEFAULT_STUCK_SESSION_WARN_MS = 120_000;
|
||||||
|
/** Lowest accepted configured threshold (ms); smaller values fall back to the default. */
const MIN_STUCK_SESSION_WARN_MS = 1_000;
|
||||||
|
/** Highest accepted configured threshold (ms), i.e. 24 hours; larger values fall back to the default. */
const MAX_STUCK_SESSION_WARN_MS = 24 * 60 * 60 * 1000;
|
||||||
|
|
||||||
/** Stamps the module-level `lastActivityAt` with the current wall-clock time in milliseconds. */
function markActivity() {
  const nowMs = Date.now();
  lastActivityAt = nowMs;
}
|
||||||
|
|
||||||
|
/**
 * Resolves the age threshold (in milliseconds) after which a session still in
 * the "processing" state is reported as stuck.
 *
 * @param config - Optional configuration; only `diagnostics.stuckSessionWarnMs` is read.
 * @returns The configured value floored to a whole number when it is a finite
 *   number within `[MIN_STUCK_SESSION_WARN_MS, MAX_STUCK_SESSION_WARN_MS]`;
 *   otherwise `DEFAULT_STUCK_SESSION_WARN_MS`.
 */
export function resolveStuckSessionWarnMs(config?: OpenClawConfig): number {
  const configured = config?.diagnostics?.stuckSessionWarnMs;

  if (typeof configured === "number" && Number.isFinite(configured)) {
    // Drop any fractional part before range-checking.
    const wholeMs = Math.floor(configured);
    if (wholeMs >= MIN_STUCK_SESSION_WARN_MS && wholeMs <= MAX_STUCK_SESSION_WARN_MS) {
      return wholeMs;
    }
  }

  // Missing, non-numeric, non-finite, or out-of-range values all use the default.
  return DEFAULT_STUCK_SESSION_WARN_MS;
}
|
||||||
|
|
||||||
export function logWebhookReceived(params: {
|
export function logWebhookReceived(params: {
|
||||||
channel: string;
|
channel: string;
|
||||||
updateType?: string;
|
updateType?: string;
|
||||||
@@ -305,10 +321,11 @@ export function logActiveRuns() {
|
|||||||
|
|
||||||
let heartbeatInterval: NodeJS.Timeout | null = null;
|
let heartbeatInterval: NodeJS.Timeout | null = null;
|
||||||
|
|
||||||
export function startDiagnosticHeartbeat() {
|
export function startDiagnosticHeartbeat(config?: OpenClawConfig) {
|
||||||
if (heartbeatInterval) {
|
if (heartbeatInterval) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
const stuckSessionWarnMs = resolveStuckSessionWarnMs(config);
|
||||||
heartbeatInterval = setInterval(() => {
|
heartbeatInterval = setInterval(() => {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
pruneDiagnosticSessionStates(now, true);
|
pruneDiagnosticSessionStates(now, true);
|
||||||
@@ -362,7 +379,7 @@ export function startDiagnosticHeartbeat() {
|
|||||||
|
|
||||||
for (const [, state] of diagnosticSessionStates) {
|
for (const [, state] of diagnosticSessionStates) {
|
||||||
const ageMs = now - state.lastActivity;
|
const ageMs = now - state.lastActivity;
|
||||||
if (state.state === "processing" && ageMs > 120_000) {
|
if (state.state === "processing" && ageMs > stuckSessionWarnMs) {
|
||||||
logSessionStuck({
|
logSessionStuck({
|
||||||
sessionId: state.sessionId,
|
sessionId: state.sessionId,
|
||||||
sessionKey: state.sessionKey,
|
sessionKey: state.sessionKey,
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ export async function startTelegramWebhook(opts: {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if (diagnosticsEnabled) {
|
if (diagnosticsEnabled) {
|
||||||
startDiagnosticHeartbeat();
|
startDiagnosticHeartbeat(opts.config);
|
||||||
}
|
}
|
||||||
|
|
||||||
const server = createServer((req, res) => {
|
const server = createServer((req, res) => {
|
||||||
|
|||||||
Reference in New Issue
Block a user