mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-11 02:04:34 +00:00
fix(agent): prevent session lock deadlock on timeout during compaction (#9855)
Merged via /review-pr -> /prepare-pr -> /merge-pr.
Prepared head SHA: 64a28900f1
Co-authored-by: mverrilli <816450+mverrilli@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
@@ -91,6 +91,10 @@ import {
|
||||
import { splitSdkTools } from "../tool-split.js";
|
||||
import { describeUnknownError, mapThinkingLevel } from "../utils.js";
|
||||
import { flushPendingToolResultsAfterIdle } from "../wait-for-idle-before-flush.js";
|
||||
import {
|
||||
selectCompactionTimeoutSnapshot,
|
||||
shouldFlagCompactionTimeout,
|
||||
} from "./compaction-timeout.js";
|
||||
import { detectAndLoadPromptImages } from "./images.js";
|
||||
|
||||
export function injectHistoryImagesIntoMessages(
|
||||
@@ -665,6 +669,7 @@ export async function runEmbeddedAttempt(
|
||||
|
||||
let aborted = Boolean(params.abortSignal?.aborted);
|
||||
let timedOut = false;
|
||||
let timedOutDuringCompaction = false;
|
||||
const getAbortReason = (signal: AbortSignal): unknown =>
|
||||
"reason" in signal ? (signal as { reason?: unknown }).reason : undefined;
|
||||
const makeTimeoutAbortReason = (): Error => {
|
||||
@@ -769,6 +774,15 @@ export async function runEmbeddedAttempt(
|
||||
`embedded run timeout: runId=${params.runId} sessionId=${params.sessionId} timeoutMs=${params.timeoutMs}`,
|
||||
);
|
||||
}
|
||||
if (
|
||||
shouldFlagCompactionTimeout({
|
||||
isTimeout: true,
|
||||
isCompactionPendingOrRetrying: subscription.isCompacting(),
|
||||
isCompactionInFlight: activeSession.isCompacting,
|
||||
})
|
||||
) {
|
||||
timedOutDuringCompaction = true;
|
||||
}
|
||||
abortRun(true);
|
||||
if (!abortWarnTimer) {
|
||||
abortWarnTimer = setTimeout(() => {
|
||||
@@ -791,6 +805,15 @@ export async function runEmbeddedAttempt(
|
||||
const onAbort = () => {
|
||||
const reason = params.abortSignal ? getAbortReason(params.abortSignal) : undefined;
|
||||
const timeout = reason ? isTimeoutError(reason) : false;
|
||||
if (
|
||||
shouldFlagCompactionTimeout({
|
||||
isTimeout: timeout,
|
||||
isCompactionPendingOrRetrying: subscription.isCompacting(),
|
||||
isCompactionInFlight: activeSession.isCompacting,
|
||||
})
|
||||
) {
|
||||
timedOutDuringCompaction = true;
|
||||
}
|
||||
abortRun(timeout, reason);
|
||||
};
|
||||
if (params.abortSignal) {
|
||||
@@ -939,13 +962,28 @@ export async function runEmbeddedAttempt(
|
||||
);
|
||||
}
|
||||
|
||||
// Capture snapshot before compaction wait so we have complete messages if timeout occurs
|
||||
// Check the compaction state both before and after the capture to avoid a race condition where compaction starts during the capture
|
||||
// Use session state (not the subscription) for snapshot decisions, because they need the instantaneous compaction status
|
||||
const wasCompactingBefore = activeSession.isCompacting;
|
||||
const snapshot = activeSession.messages.slice();
|
||||
const wasCompactingAfter = activeSession.isCompacting;
|
||||
// Only trust the snapshot if compaction was running neither before nor after the capture
|
||||
const preCompactionSnapshot = wasCompactingBefore || wasCompactingAfter ? null : snapshot;
|
||||
const preCompactionSessionId = activeSession.sessionId;
|
||||
|
||||
try {
|
||||
await waitForCompactionRetry();
|
||||
await abortable(waitForCompactionRetry());
|
||||
} catch (err) {
|
||||
if (isRunnerAbortError(err)) {
|
||||
if (!promptError) {
|
||||
promptError = err;
|
||||
}
|
||||
if (!isProbeSession) {
|
||||
log.debug(
|
||||
`compaction wait aborted: runId=${params.runId} sessionId=${params.sessionId}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
throw err;
|
||||
}
|
||||
@@ -956,27 +994,51 @@ export async function runEmbeddedAttempt(
|
||||
// inserted between compaction and the next prompt — breaking the
|
||||
// prepareCompaction() guard that checks the last entry type, leading to
|
||||
// double-compaction. See: https://github.com/openclaw/openclaw/issues/9282
|
||||
const shouldTrackCacheTtl =
|
||||
params.config?.agents?.defaults?.contextPruning?.mode === "cache-ttl" &&
|
||||
isCacheTtlEligibleProvider(params.provider, params.modelId);
|
||||
if (shouldTrackCacheTtl) {
|
||||
appendCacheTtlTimestamp(sessionManager, {
|
||||
timestamp: Date.now(),
|
||||
provider: params.provider,
|
||||
modelId: params.modelId,
|
||||
});
|
||||
// Skip when timed out during compaction — session state may be inconsistent.
|
||||
if (!timedOutDuringCompaction) {
|
||||
const shouldTrackCacheTtl =
|
||||
params.config?.agents?.defaults?.contextPruning?.mode === "cache-ttl" &&
|
||||
isCacheTtlEligibleProvider(params.provider, params.modelId);
|
||||
if (shouldTrackCacheTtl) {
|
||||
appendCacheTtlTimestamp(sessionManager, {
|
||||
timestamp: Date.now(),
|
||||
provider: params.provider,
|
||||
modelId: params.modelId,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
messagesSnapshot = activeSession.messages.slice();
|
||||
sessionIdUsed = activeSession.sessionId;
|
||||
// If timeout occurred during compaction, use pre-compaction snapshot when available
|
||||
// (compaction restructures messages but does not add user/assistant turns).
|
||||
const snapshotSelection = selectCompactionTimeoutSnapshot({
|
||||
timedOutDuringCompaction,
|
||||
preCompactionSnapshot,
|
||||
preCompactionSessionId,
|
||||
currentSnapshot: activeSession.messages.slice(),
|
||||
currentSessionId: activeSession.sessionId,
|
||||
});
|
||||
if (timedOutDuringCompaction) {
|
||||
if (!isProbeSession) {
|
||||
log.warn(
|
||||
`using ${snapshotSelection.source} snapshot: timed out during compaction runId=${params.runId} sessionId=${params.sessionId}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
messagesSnapshot = snapshotSelection.messagesSnapshot;
|
||||
sessionIdUsed = snapshotSelection.sessionIdUsed;
|
||||
cacheTrace?.recordStage("session:after", {
|
||||
messages: messagesSnapshot,
|
||||
note: promptError ? "prompt error" : undefined,
|
||||
note: timedOutDuringCompaction
|
||||
? "compaction timeout"
|
||||
: promptError
|
||||
? "prompt error"
|
||||
: undefined,
|
||||
});
|
||||
anthropicPayloadLogger?.recordUsage(messagesSnapshot, promptError);
|
||||
|
||||
// Run agent_end hooks to allow plugins to analyze the conversation
|
||||
// This is fire-and-forget, so we don't await
|
||||
// Run even on compaction timeout so plugins can log/cleanup
|
||||
if (hookRunner?.hasHooks("agent_end")) {
|
||||
hookRunner
|
||||
.runAgentEnd(
|
||||
@@ -1003,7 +1065,21 @@ export async function runEmbeddedAttempt(
|
||||
if (abortWarnTimer) {
|
||||
clearTimeout(abortWarnTimer);
|
||||
}
|
||||
unsubscribe();
|
||||
if (!isProbeSession && (aborted || timedOut) && !timedOutDuringCompaction) {
|
||||
log.debug(
|
||||
`run cleanup: runId=${params.runId} sessionId=${params.sessionId} aborted=${aborted} timedOut=${timedOut}`,
|
||||
);
|
||||
}
|
||||
try {
|
||||
unsubscribe();
|
||||
} catch (err) {
|
||||
// unsubscribe() should never throw; if it does, it indicates a serious bug.
|
||||
// Log at error level to ensure visibility, but don't rethrow in finally block
|
||||
// as it would mask any exception from the try block above.
|
||||
log.error(
|
||||
`CRITICAL: unsubscribe failed, possible resource leak: runId=${params.runId} ${String(err)}`,
|
||||
);
|
||||
}
|
||||
clearActiveEmbeddedRun(params.sessionId, queueHandle);
|
||||
params.abortSignal?.removeEventListener?.("abort", onAbort);
|
||||
}
|
||||
@@ -1023,6 +1099,7 @@ export async function runEmbeddedAttempt(
|
||||
return {
|
||||
aborted,
|
||||
timedOut,
|
||||
timedOutDuringCompaction,
|
||||
promptError,
|
||||
sessionIdUsed,
|
||||
systemPromptReport,
|
||||
|
||||
Reference in New Issue
Block a user