fix(agent): prevent session lock deadlock on timeout during compaction (#9855)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 64a28900f1
Co-authored-by: mverrilli <816450+mverrilli@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
Michael Verrilli
2026-02-14 14:24:20 -05:00
committed by GitHub
parent f537bd1796
commit e6f67d5f31
11 changed files with 365 additions and 20 deletions

View File

@@ -206,6 +206,7 @@ function makeAttemptResult(
return {
aborted: false,
timedOut: false,
timedOutDuringCompaction: false,
promptError: null,
sessionIdUsed: "test-session",
assistantTexts: ["Hello!"],

View File

@@ -480,7 +480,14 @@ export async function runEmbeddedPiAgent(
enforceFinalTag: params.enforceFinalTag,
});
const { aborted, promptError, timedOut, sessionIdUsed, lastAssistant } = attempt;
const {
aborted,
promptError,
timedOut,
timedOutDuringCompaction,
sessionIdUsed,
lastAssistant,
} = attempt;
const lastAssistantUsage = normalizeUsage(lastAssistant?.usage as UsageLike);
const attemptUsage = attempt.attemptUsage ?? lastAssistantUsage;
mergeUsageIntoAccumulator(usageAccumulator, attemptUsage);
@@ -801,7 +808,9 @@ export async function runEmbeddedPiAgent(
}
// Treat timeout as potential rate limit (Antigravity hangs on rate limit)
const shouldRotate = (!aborted && failoverFailure) || timedOut;
// But exclude post-prompt compaction timeouts (model succeeded; no profile issue)
const shouldRotate =
(!aborted && failoverFailure) || (timedOut && !timedOutDuringCompaction);
if (shouldRotate) {
if (lastProfileId) {

View File

@@ -91,6 +91,10 @@ import {
import { splitSdkTools } from "../tool-split.js";
import { describeUnknownError, mapThinkingLevel } from "../utils.js";
import { flushPendingToolResultsAfterIdle } from "../wait-for-idle-before-flush.js";
import {
selectCompactionTimeoutSnapshot,
shouldFlagCompactionTimeout,
} from "./compaction-timeout.js";
import { detectAndLoadPromptImages } from "./images.js";
export function injectHistoryImagesIntoMessages(
@@ -665,6 +669,7 @@ export async function runEmbeddedAttempt(
let aborted = Boolean(params.abortSignal?.aborted);
let timedOut = false;
let timedOutDuringCompaction = false;
const getAbortReason = (signal: AbortSignal): unknown =>
"reason" in signal ? (signal as { reason?: unknown }).reason : undefined;
const makeTimeoutAbortReason = (): Error => {
@@ -769,6 +774,15 @@ export async function runEmbeddedAttempt(
`embedded run timeout: runId=${params.runId} sessionId=${params.sessionId} timeoutMs=${params.timeoutMs}`,
);
}
if (
shouldFlagCompactionTimeout({
isTimeout: true,
isCompactionPendingOrRetrying: subscription.isCompacting(),
isCompactionInFlight: activeSession.isCompacting,
})
) {
timedOutDuringCompaction = true;
}
abortRun(true);
if (!abortWarnTimer) {
abortWarnTimer = setTimeout(() => {
@@ -791,6 +805,15 @@ export async function runEmbeddedAttempt(
const onAbort = () => {
const reason = params.abortSignal ? getAbortReason(params.abortSignal) : undefined;
const timeout = reason ? isTimeoutError(reason) : false;
if (
shouldFlagCompactionTimeout({
isTimeout: timeout,
isCompactionPendingOrRetrying: subscription.isCompacting(),
isCompactionInFlight: activeSession.isCompacting,
})
) {
timedOutDuringCompaction = true;
}
abortRun(timeout, reason);
};
if (params.abortSignal) {
@@ -939,13 +962,28 @@ export async function runEmbeddedAttempt(
);
}
// Capture snapshot before compaction wait so we have complete messages if timeout occurs
// Check compaction state before and after to avoid race condition where compaction starts during capture
// Use session state (not subscription) for snapshot decisions - need instantaneous compaction status
const wasCompactingBefore = activeSession.isCompacting;
const snapshot = activeSession.messages.slice();
const wasCompactingAfter = activeSession.isCompacting;
// Only trust snapshot if compaction wasn't running before or after capture
const preCompactionSnapshot = wasCompactingBefore || wasCompactingAfter ? null : snapshot;
const preCompactionSessionId = activeSession.sessionId;
try {
await waitForCompactionRetry();
await abortable(waitForCompactionRetry());
} catch (err) {
if (isRunnerAbortError(err)) {
if (!promptError) {
promptError = err;
}
if (!isProbeSession) {
log.debug(
`compaction wait aborted: runId=${params.runId} sessionId=${params.sessionId}`,
);
}
} else {
throw err;
}
@@ -956,27 +994,51 @@ export async function runEmbeddedAttempt(
// inserted between compaction and the next prompt — breaking the
// prepareCompaction() guard that checks the last entry type, leading to
// double-compaction. See: https://github.com/openclaw/openclaw/issues/9282
const shouldTrackCacheTtl =
params.config?.agents?.defaults?.contextPruning?.mode === "cache-ttl" &&
isCacheTtlEligibleProvider(params.provider, params.modelId);
if (shouldTrackCacheTtl) {
appendCacheTtlTimestamp(sessionManager, {
timestamp: Date.now(),
provider: params.provider,
modelId: params.modelId,
});
// Skip when timed out during compaction — session state may be inconsistent.
if (!timedOutDuringCompaction) {
const shouldTrackCacheTtl =
params.config?.agents?.defaults?.contextPruning?.mode === "cache-ttl" &&
isCacheTtlEligibleProvider(params.provider, params.modelId);
if (shouldTrackCacheTtl) {
appendCacheTtlTimestamp(sessionManager, {
timestamp: Date.now(),
provider: params.provider,
modelId: params.modelId,
});
}
}
messagesSnapshot = activeSession.messages.slice();
sessionIdUsed = activeSession.sessionId;
// If timeout occurred during compaction, use pre-compaction snapshot when available
// (compaction restructures messages but does not add user/assistant turns).
const snapshotSelection = selectCompactionTimeoutSnapshot({
timedOutDuringCompaction,
preCompactionSnapshot,
preCompactionSessionId,
currentSnapshot: activeSession.messages.slice(),
currentSessionId: activeSession.sessionId,
});
if (timedOutDuringCompaction) {
if (!isProbeSession) {
log.warn(
`using ${snapshotSelection.source} snapshot: timed out during compaction runId=${params.runId} sessionId=${params.sessionId}`,
);
}
}
messagesSnapshot = snapshotSelection.messagesSnapshot;
sessionIdUsed = snapshotSelection.sessionIdUsed;
cacheTrace?.recordStage("session:after", {
messages: messagesSnapshot,
note: promptError ? "prompt error" : undefined,
note: timedOutDuringCompaction
? "compaction timeout"
: promptError
? "prompt error"
: undefined,
});
anthropicPayloadLogger?.recordUsage(messagesSnapshot, promptError);
// Run agent_end hooks to allow plugins to analyze the conversation
// This is fire-and-forget, so we don't await
// Run even on compaction timeout so plugins can log/cleanup
if (hookRunner?.hasHooks("agent_end")) {
hookRunner
.runAgentEnd(
@@ -1003,7 +1065,21 @@ export async function runEmbeddedAttempt(
if (abortWarnTimer) {
clearTimeout(abortWarnTimer);
}
unsubscribe();
if (!isProbeSession && (aborted || timedOut) && !timedOutDuringCompaction) {
log.debug(
`run cleanup: runId=${params.runId} sessionId=${params.sessionId} aborted=${aborted} timedOut=${timedOut}`,
);
}
try {
unsubscribe();
} catch (err) {
// unsubscribe() should never throw; if it does, it indicates a serious bug.
// Log at error level to ensure visibility, but don't rethrow in finally block
// as it would mask any exception from the try block above.
log.error(
`CRITICAL: unsubscribe failed, possible resource leak: runId=${params.runId} ${String(err)}`,
);
}
clearActiveEmbeddedRun(params.sessionId, queueHandle);
params.abortSignal?.removeEventListener?.("abort", onAbort);
}
@@ -1023,6 +1099,7 @@ export async function runEmbeddedAttempt(
return {
aborted,
timedOut,
timedOutDuringCompaction,
promptError,
sessionIdUsed,
systemPromptReport,

View File

@@ -0,0 +1,61 @@
import { describe, expect, it } from "vitest";
import {
selectCompactionTimeoutSnapshot,
shouldFlagCompactionTimeout,
} from "./compaction-timeout.js";
// Unit tests for the pure helpers exported from compaction-timeout.ts:
//  - shouldFlagCompactionTimeout: is a timeout attributable to compaction?
//  - selectCompactionTimeoutSnapshot: which message snapshot/session id to report.
describe("compaction-timeout helpers", () => {
  it("flags compaction timeout consistently for internal and external timeout sources", () => {
    const internalTimer = shouldFlagCompactionTimeout({
      isTimeout: true,
      isCompactionPendingOrRetrying: true,
      isCompactionInFlight: false,
    });
    const externalAbort = shouldFlagCompactionTimeout({
      isTimeout: true,
      isCompactionPendingOrRetrying: true,
      isCompactionInFlight: false,
    });
    expect(internalTimer).toBe(true);
    expect(externalAbort).toBe(true);
  });

  // Either compaction indicator on its own must be enough to flag the timeout.
  it("flags when compaction is in flight even if nothing is pending or retrying", () => {
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: true,
        isCompactionPendingOrRetrying: false,
        isCompactionInFlight: true,
      }),
    ).toBe(true);
  });

  // A plain timeout with no compaction activity must not be flagged.
  it("does not flag a timeout when no compaction is pending or in flight", () => {
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: true,
        isCompactionPendingOrRetrying: false,
        isCompactionInFlight: false,
      }),
    ).toBe(false);
  });

  it("does not flag when timeout is false", () => {
    expect(
      shouldFlagCompactionTimeout({
        isTimeout: false,
        isCompactionPendingOrRetrying: true,
        isCompactionInFlight: true,
      }),
    ).toBe(false);
  });

  it("uses pre-compaction snapshot when compaction timeout occurs", () => {
    const pre = [{ role: "assistant", content: "pre" }] as const;
    const current = [{ role: "assistant", content: "current" }] as const;
    const selected = selectCompactionTimeoutSnapshot({
      timedOutDuringCompaction: true,
      preCompactionSnapshot: [...pre],
      preCompactionSessionId: "session-pre",
      currentSnapshot: [...current],
      currentSessionId: "session-current",
    });
    expect(selected.source).toBe("pre-compaction");
    expect(selected.sessionIdUsed).toBe("session-pre");
    expect(selected.messagesSnapshot).toEqual(pre);
  });

  it("falls back to current snapshot when pre-compaction snapshot is unavailable", () => {
    const current = [{ role: "assistant", content: "current" }] as const;
    const selected = selectCompactionTimeoutSnapshot({
      timedOutDuringCompaction: true,
      preCompactionSnapshot: null,
      preCompactionSessionId: "session-pre",
      currentSnapshot: [...current],
      currentSessionId: "session-current",
    });
    expect(selected.source).toBe("current");
    expect(selected.sessionIdUsed).toBe("session-current");
    expect(selected.messagesSnapshot).toEqual(current);
  });

  // Without a compaction timeout the pre-compaction snapshot must be ignored,
  // even when one is available.
  it("uses current snapshot when the run did not time out during compaction", () => {
    const pre = [{ role: "assistant", content: "pre" }] as const;
    const current = [{ role: "assistant", content: "current" }] as const;
    const selected = selectCompactionTimeoutSnapshot({
      timedOutDuringCompaction: false,
      preCompactionSnapshot: [...pre],
      preCompactionSessionId: "session-pre",
      currentSnapshot: [...current],
      currentSessionId: "session-current",
    });
    expect(selected.source).toBe("current");
    expect(selected.sessionIdUsed).toBe("session-current");
    expect(selected.messagesSnapshot).toEqual(current);
  });
});

View File

@@ -0,0 +1,54 @@
import type { AgentMessage } from "@mariozechner/pi-agent-core";
/**
 * Inputs evaluated by `shouldFlagCompactionTimeout` to decide whether an
 * abort counts as a timeout that happened during compaction.
 */
export type CompactionTimeoutSignal = {
/** Whether the abort being evaluated is a timeout; when false nothing is flagged. */
isTimeout: boolean;
/** First compaction indicator; on its own it is enough to flag a timeout. */
isCompactionPendingOrRetrying: boolean;
/** Second compaction indicator; on its own it is also enough to flag a timeout. */
isCompactionInFlight: boolean;
};
/**
 * Reports whether a timeout should be attributed to compaction.
 *
 * @param signal - Timeout flag plus the two compaction indicators.
 * @returns `false` whenever `signal.isTimeout` is falsy; otherwise the result
 *   of OR-ing the pending/retrying indicator with the in-flight indicator.
 */
export function shouldFlagCompactionTimeout(signal: CompactionTimeoutSignal): boolean {
  // The compaction indicators are only consulted once a timeout is confirmed.
  return signal.isTimeout
    ? signal.isCompactionPendingOrRetrying || signal.isCompactionInFlight
    : false;
}
/**
 * Inputs for `selectCompactionTimeoutSnapshot`: a candidate pre-compaction
 * snapshot/session id pair and the current snapshot/session id pair.
 */
export type SnapshotSelectionParams = {
/** When false, the current snapshot and session id are always selected. */
timedOutDuringCompaction: boolean;
/** Messages captured before compaction, or null when no such snapshot is available. */
preCompactionSnapshot: AgentMessage[] | null;
/** Session id reported together with `preCompactionSnapshot` when it is selected. */
preCompactionSessionId: string;
/** Messages used whenever the pre-compaction snapshot is not selected. */
currentSnapshot: AgentMessage[];
/** Session id reported together with `currentSnapshot`. */
currentSessionId: string;
};
/** Result of `selectCompactionTimeoutSnapshot`. */
export type SnapshotSelection = {
/** The chosen message list (either the pre-compaction or the current one). */
messagesSnapshot: AgentMessage[];
/** The session id paired with the chosen message list. */
sessionIdUsed: string;
/** Which of the two candidate snapshots was chosen. */
source: "pre-compaction" | "current";
};
/**
 * Chooses which message snapshot and session id to report for a run.
 *
 * The pre-compaction pair is returned only when the run timed out during
 * compaction AND a pre-compaction snapshot is present; in every other case
 * the current pair is returned.
 *
 * @param params - Timeout flag plus both candidate snapshot/session id pairs.
 * @returns The selected messages, the matching session id, and a `source`
 *   tag naming which candidate was picked.
 */
export function selectCompactionTimeoutSnapshot(
  params: SnapshotSelectionParams,
): SnapshotSelection {
  // The pre-compaction snapshot is only a candidate after a compaction timeout.
  const preferred = params.timedOutDuringCompaction ? params.preCompactionSnapshot : null;

  if (preferred) {
    return {
      messagesSnapshot: preferred,
      sessionIdUsed: params.preCompactionSessionId,
      source: "pre-compaction",
    };
  }

  // Covers both "no compaction timeout" and "no pre-compaction snapshot".
  return {
    messagesSnapshot: params.currentSnapshot,
    sessionIdUsed: params.currentSessionId,
    source: "current",
  };
}

View File

@@ -24,6 +24,8 @@ export type EmbeddedRunAttemptParams = EmbeddedRunAttemptBase & {
export type EmbeddedRunAttemptResult = {
aborted: boolean;
timedOut: boolean;
/** True if the timeout occurred while compaction was in progress or pending. */
timedOutDuringCompaction: boolean;
promptError: unknown;
sessionIdUsed: string;
systemPromptReport?: SessionSystemPromptReport;