mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 04:51:25 +00:00
fix(agent): prevent session lock deadlock on timeout during compaction (#9855)
Merged via /review-pr -> /prepare-pr -> /merge-pr.
Prepared head SHA: 64a28900f1
Co-authored-by: mverrilli <816450+mverrilli@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
@@ -206,6 +206,7 @@ function makeAttemptResult(
|
||||
return {
|
||||
aborted: false,
|
||||
timedOut: false,
|
||||
timedOutDuringCompaction: false,
|
||||
promptError: null,
|
||||
sessionIdUsed: "test-session",
|
||||
assistantTexts: ["Hello!"],
|
||||
|
||||
@@ -480,7 +480,14 @@ export async function runEmbeddedPiAgent(
|
||||
enforceFinalTag: params.enforceFinalTag,
|
||||
});
|
||||
|
||||
const { aborted, promptError, timedOut, sessionIdUsed, lastAssistant } = attempt;
|
||||
const {
|
||||
aborted,
|
||||
promptError,
|
||||
timedOut,
|
||||
timedOutDuringCompaction,
|
||||
sessionIdUsed,
|
||||
lastAssistant,
|
||||
} = attempt;
|
||||
const lastAssistantUsage = normalizeUsage(lastAssistant?.usage as UsageLike);
|
||||
const attemptUsage = attempt.attemptUsage ?? lastAssistantUsage;
|
||||
mergeUsageIntoAccumulator(usageAccumulator, attemptUsage);
|
||||
@@ -801,7 +808,9 @@ export async function runEmbeddedPiAgent(
|
||||
}
|
||||
|
||||
// Treat timeout as potential rate limit (Antigravity hangs on rate limit)
|
||||
const shouldRotate = (!aborted && failoverFailure) || timedOut;
|
||||
// But exclude post-prompt compaction timeouts (model succeeded; no profile issue)
|
||||
const shouldRotate =
|
||||
(!aborted && failoverFailure) || (timedOut && !timedOutDuringCompaction);
|
||||
|
||||
if (shouldRotate) {
|
||||
if (lastProfileId) {
|
||||
|
||||
@@ -91,6 +91,10 @@ import {
|
||||
import { splitSdkTools } from "../tool-split.js";
|
||||
import { describeUnknownError, mapThinkingLevel } from "../utils.js";
|
||||
import { flushPendingToolResultsAfterIdle } from "../wait-for-idle-before-flush.js";
|
||||
import {
|
||||
selectCompactionTimeoutSnapshot,
|
||||
shouldFlagCompactionTimeout,
|
||||
} from "./compaction-timeout.js";
|
||||
import { detectAndLoadPromptImages } from "./images.js";
|
||||
|
||||
export function injectHistoryImagesIntoMessages(
|
||||
@@ -665,6 +669,7 @@ export async function runEmbeddedAttempt(
|
||||
|
||||
let aborted = Boolean(params.abortSignal?.aborted);
|
||||
let timedOut = false;
|
||||
let timedOutDuringCompaction = false;
|
||||
const getAbortReason = (signal: AbortSignal): unknown =>
|
||||
"reason" in signal ? (signal as { reason?: unknown }).reason : undefined;
|
||||
const makeTimeoutAbortReason = (): Error => {
|
||||
@@ -769,6 +774,15 @@ export async function runEmbeddedAttempt(
|
||||
`embedded run timeout: runId=${params.runId} sessionId=${params.sessionId} timeoutMs=${params.timeoutMs}`,
|
||||
);
|
||||
}
|
||||
if (
|
||||
shouldFlagCompactionTimeout({
|
||||
isTimeout: true,
|
||||
isCompactionPendingOrRetrying: subscription.isCompacting(),
|
||||
isCompactionInFlight: activeSession.isCompacting,
|
||||
})
|
||||
) {
|
||||
timedOutDuringCompaction = true;
|
||||
}
|
||||
abortRun(true);
|
||||
if (!abortWarnTimer) {
|
||||
abortWarnTimer = setTimeout(() => {
|
||||
@@ -791,6 +805,15 @@ export async function runEmbeddedAttempt(
|
||||
const onAbort = () => {
|
||||
const reason = params.abortSignal ? getAbortReason(params.abortSignal) : undefined;
|
||||
const timeout = reason ? isTimeoutError(reason) : false;
|
||||
if (
|
||||
shouldFlagCompactionTimeout({
|
||||
isTimeout: timeout,
|
||||
isCompactionPendingOrRetrying: subscription.isCompacting(),
|
||||
isCompactionInFlight: activeSession.isCompacting,
|
||||
})
|
||||
) {
|
||||
timedOutDuringCompaction = true;
|
||||
}
|
||||
abortRun(timeout, reason);
|
||||
};
|
||||
if (params.abortSignal) {
|
||||
@@ -939,13 +962,28 @@ export async function runEmbeddedAttempt(
|
||||
);
|
||||
}
|
||||
|
||||
// Capture snapshot before compaction wait so we have complete messages if timeout occurs
|
||||
// Check compaction state before and after to avoid race condition where compaction starts during capture
|
||||
// Use session state (not subscription) for snapshot decisions - need instantaneous compaction status
|
||||
const wasCompactingBefore = activeSession.isCompacting;
|
||||
const snapshot = activeSession.messages.slice();
|
||||
const wasCompactingAfter = activeSession.isCompacting;
|
||||
// Only trust snapshot if compaction wasn't running before or after capture
|
||||
const preCompactionSnapshot = wasCompactingBefore || wasCompactingAfter ? null : snapshot;
|
||||
const preCompactionSessionId = activeSession.sessionId;
|
||||
|
||||
try {
|
||||
await waitForCompactionRetry();
|
||||
await abortable(waitForCompactionRetry());
|
||||
} catch (err) {
|
||||
if (isRunnerAbortError(err)) {
|
||||
if (!promptError) {
|
||||
promptError = err;
|
||||
}
|
||||
if (!isProbeSession) {
|
||||
log.debug(
|
||||
`compaction wait aborted: runId=${params.runId} sessionId=${params.sessionId}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
throw err;
|
||||
}
|
||||
@@ -956,27 +994,51 @@ export async function runEmbeddedAttempt(
|
||||
// inserted between compaction and the next prompt — breaking the
|
||||
// prepareCompaction() guard that checks the last entry type, leading to
|
||||
// double-compaction. See: https://github.com/openclaw/openclaw/issues/9282
|
||||
const shouldTrackCacheTtl =
|
||||
params.config?.agents?.defaults?.contextPruning?.mode === "cache-ttl" &&
|
||||
isCacheTtlEligibleProvider(params.provider, params.modelId);
|
||||
if (shouldTrackCacheTtl) {
|
||||
appendCacheTtlTimestamp(sessionManager, {
|
||||
timestamp: Date.now(),
|
||||
provider: params.provider,
|
||||
modelId: params.modelId,
|
||||
});
|
||||
// Skip when timed out during compaction — session state may be inconsistent.
|
||||
if (!timedOutDuringCompaction) {
|
||||
const shouldTrackCacheTtl =
|
||||
params.config?.agents?.defaults?.contextPruning?.mode === "cache-ttl" &&
|
||||
isCacheTtlEligibleProvider(params.provider, params.modelId);
|
||||
if (shouldTrackCacheTtl) {
|
||||
appendCacheTtlTimestamp(sessionManager, {
|
||||
timestamp: Date.now(),
|
||||
provider: params.provider,
|
||||
modelId: params.modelId,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
messagesSnapshot = activeSession.messages.slice();
|
||||
sessionIdUsed = activeSession.sessionId;
|
||||
// If timeout occurred during compaction, use pre-compaction snapshot when available
|
||||
// (compaction restructures messages but does not add user/assistant turns).
|
||||
const snapshotSelection = selectCompactionTimeoutSnapshot({
|
||||
timedOutDuringCompaction,
|
||||
preCompactionSnapshot,
|
||||
preCompactionSessionId,
|
||||
currentSnapshot: activeSession.messages.slice(),
|
||||
currentSessionId: activeSession.sessionId,
|
||||
});
|
||||
if (timedOutDuringCompaction) {
|
||||
if (!isProbeSession) {
|
||||
log.warn(
|
||||
`using ${snapshotSelection.source} snapshot: timed out during compaction runId=${params.runId} sessionId=${params.sessionId}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
messagesSnapshot = snapshotSelection.messagesSnapshot;
|
||||
sessionIdUsed = snapshotSelection.sessionIdUsed;
|
||||
cacheTrace?.recordStage("session:after", {
|
||||
messages: messagesSnapshot,
|
||||
note: promptError ? "prompt error" : undefined,
|
||||
note: timedOutDuringCompaction
|
||||
? "compaction timeout"
|
||||
: promptError
|
||||
? "prompt error"
|
||||
: undefined,
|
||||
});
|
||||
anthropicPayloadLogger?.recordUsage(messagesSnapshot, promptError);
|
||||
|
||||
// Run agent_end hooks to allow plugins to analyze the conversation
|
||||
// This is fire-and-forget, so we don't await
|
||||
// Run even on compaction timeout so plugins can log/cleanup
|
||||
if (hookRunner?.hasHooks("agent_end")) {
|
||||
hookRunner
|
||||
.runAgentEnd(
|
||||
@@ -1003,7 +1065,21 @@ export async function runEmbeddedAttempt(
|
||||
if (abortWarnTimer) {
|
||||
clearTimeout(abortWarnTimer);
|
||||
}
|
||||
unsubscribe();
|
||||
if (!isProbeSession && (aborted || timedOut) && !timedOutDuringCompaction) {
|
||||
log.debug(
|
||||
`run cleanup: runId=${params.runId} sessionId=${params.sessionId} aborted=${aborted} timedOut=${timedOut}`,
|
||||
);
|
||||
}
|
||||
try {
|
||||
unsubscribe();
|
||||
} catch (err) {
|
||||
// unsubscribe() should never throw; if it does, it indicates a serious bug.
|
||||
// Log at error level to ensure visibility, but don't rethrow in finally block
|
||||
// as it would mask any exception from the try block above.
|
||||
log.error(
|
||||
`CRITICAL: unsubscribe failed, possible resource leak: runId=${params.runId} ${String(err)}`,
|
||||
);
|
||||
}
|
||||
clearActiveEmbeddedRun(params.sessionId, queueHandle);
|
||||
params.abortSignal?.removeEventListener?.("abort", onAbort);
|
||||
}
|
||||
@@ -1023,6 +1099,7 @@ export async function runEmbeddedAttempt(
|
||||
return {
|
||||
aborted,
|
||||
timedOut,
|
||||
timedOutDuringCompaction,
|
||||
promptError,
|
||||
sessionIdUsed,
|
||||
systemPromptReport,
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
selectCompactionTimeoutSnapshot,
|
||||
shouldFlagCompactionTimeout,
|
||||
} from "./compaction-timeout.js";
|
||||
|
||||
// Unit tests for the pure helpers in ./compaction-timeout.js.
// The helpers take plain flags/arrays, so every case is table-like: build the
// input, call the helper, assert on the returned value.
describe("compaction-timeout helpers", () => {
  it("flags a timeout when compaction is pending or retrying", () => {
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: true,
        isCompactionPendingOrRetrying: true,
        isCompactionInFlight: false,
      }),
    ).toBe(true);
  });

  it("flags a timeout when compaction is in flight", () => {
    // Distinct from the case above: only the in-flight flag is set.
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: true,
        isCompactionPendingOrRetrying: false,
        isCompactionInFlight: true,
      }),
    ).toBe(true);
  });

  it("does not flag a timeout when no compaction is pending or running", () => {
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: true,
        isCompactionPendingOrRetrying: false,
        isCompactionInFlight: false,
      }),
    ).toBe(false);
  });

  it("does not flag when timeout is false", () => {
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: false,
        isCompactionPendingOrRetrying: true,
        isCompactionInFlight: true,
      }),
    ).toBe(false);
  });

  it("uses pre-compaction snapshot when compaction timeout occurs", () => {
    const pre = [{ role: "assistant", content: "pre" }] as const;
    const current = [{ role: "assistant", content: "current" }] as const;
    const selected = selectCompactionTimeoutSnapshot({
      timedOutDuringCompaction: true,
      preCompactionSnapshot: [...pre],
      preCompactionSessionId: "session-pre",
      currentSnapshot: [...current],
      currentSessionId: "session-current",
    });
    expect(selected.source).toBe("pre-compaction");
    expect(selected.sessionIdUsed).toBe("session-pre");
    expect(selected.messagesSnapshot).toEqual(pre);
  });

  it("falls back to current snapshot when pre-compaction snapshot is unavailable", () => {
    const current = [{ role: "assistant", content: "current" }] as const;
    const selected = selectCompactionTimeoutSnapshot({
      timedOutDuringCompaction: true,
      preCompactionSnapshot: null,
      preCompactionSessionId: "session-pre",
      currentSnapshot: [...current],
      currentSessionId: "session-current",
    });
    expect(selected.source).toBe("current");
    expect(selected.sessionIdUsed).toBe("session-current");
    expect(selected.messagesSnapshot).toEqual(current);
  });

  it("uses current snapshot when there was no compaction timeout", () => {
    // A pre-compaction snapshot is available but must be ignored here.
    const pre = [{ role: "assistant", content: "pre" }] as const;
    const current = [{ role: "assistant", content: "current" }] as const;
    const selected = selectCompactionTimeoutSnapshot({
      timedOutDuringCompaction: false,
      preCompactionSnapshot: [...pre],
      preCompactionSessionId: "session-pre",
      currentSnapshot: [...current],
      currentSessionId: "session-current",
    });
    expect(selected.source).toBe("current");
    expect(selected.sessionIdUsed).toBe("session-current");
    expect(selected.messagesSnapshot).toEqual(current);
  });
});
|
||||
54
src/agents/pi-embedded-runner/run/compaction-timeout.ts
Normal file
@@ -0,0 +1,54 @@
|
||||
import type { AgentMessage } from "@mariozechner/pi-agent-core";
|
||||
|
||||
/**
 * Flags consulted when deciding whether a timeout should be attributed to
 * compaction rather than to the prompt itself.
 */
export type CompactionTimeoutSignal = {
  /** Whether the abort being evaluated is a timeout. */
  isTimeout: boolean;
  /** Caller-reported: a compaction is pending or being retried. */
  isCompactionPendingOrRetrying: boolean;
  /** Caller-reported: a compaction is running right now. */
  isCompactionInFlight: boolean;
};

/**
 * Decides whether a timeout coincided with compaction.
 *
 * @param signal - Timeout flag plus the two compaction-state flags.
 * @returns `true` only when `isTimeout` is set and at least one of the two
 *   compaction flags is set; `false` otherwise.
 */
export function shouldFlagCompactionTimeout(signal: CompactionTimeoutSignal): boolean {
  // A non-timeout abort is never attributed to compaction, whatever its state.
  if (!signal.isTimeout) {
    return false;
  }
  return signal.isCompactionPendingOrRetrying || signal.isCompactionInFlight;
}
|
||||
|
||||
/** Inputs for {@link selectCompactionTimeoutSnapshot}. */
export type SnapshotSelectionParams = {
  /** Whether the run timed out while compaction was pending or running. */
  timedOutDuringCompaction: boolean;
  /** Messages captured before the compaction wait, or `null` if none is available. */
  preCompactionSnapshot: AgentMessage[] | null;
  /** Session id recorded alongside the pre-compaction snapshot. */
  preCompactionSessionId: string;
  /** Messages as they are at selection time. */
  currentSnapshot: AgentMessage[];
  /** Session id as it is at selection time. */
  currentSessionId: string;
};

/** Result of {@link selectCompactionTimeoutSnapshot}. */
export type SnapshotSelection = {
  /** The chosen message list (same array reference as the chosen input). */
  messagesSnapshot: AgentMessage[];
  /** The session id paired with the chosen snapshot. */
  sessionIdUsed: string;
  /** Which input was chosen. */
  source: "pre-compaction" | "current";
};

/**
 * Picks which message snapshot and session id to report for a run.
 *
 * The pre-compaction pair is chosen only when the run timed out during
 * compaction AND a pre-compaction snapshot exists; in every other case the
 * current pair is returned.
 *
 * @param params - Candidate snapshots, their session ids, and the timeout flag.
 * @returns The selected snapshot, its session id, and a `source` tag.
 */
export function selectCompactionTimeoutSnapshot(
  params: SnapshotSelectionParams,
): SnapshotSelection {
  // Truthiness check: `null` falls through, while an empty array still counts
  // as an available pre-compaction snapshot.
  if (params.timedOutDuringCompaction && params.preCompactionSnapshot) {
    return {
      messagesSnapshot: params.preCompactionSnapshot,
      sessionIdUsed: params.preCompactionSessionId,
      source: "pre-compaction",
    };
  }

  return {
    messagesSnapshot: params.currentSnapshot,
    sessionIdUsed: params.currentSessionId,
    source: "current",
  };
}
|
||||
@@ -24,6 +24,8 @@ export type EmbeddedRunAttemptParams = EmbeddedRunAttemptBase & {
|
||||
export type EmbeddedRunAttemptResult = {
|
||||
aborted: boolean;
|
||||
timedOut: boolean;
|
||||
/** True if the timeout occurred while compaction was in progress or pending. */
|
||||
timedOutDuringCompaction: boolean;
|
||||
promptError: unknown;
|
||||
sessionIdUsed: string;
|
||||
systemPromptReport?: SessionSystemPromptReport;
|
||||
|
||||
Reference in New Issue
Block a user