fix: handle CLI session expired errors gracefully instead of crashing gateway (#31090)

* fix: handle CLI session expired errors gracefully

- Add session_expired to FailoverReason type
- Add isCliSessionExpiredErrorMessage to detect expired CLI sessions
- Modify runCliAgent to retry with new session when session expires
- Update agentCommand to clear expired session IDs from session store
- Add proper error handling to prevent gateway crashes on expired sessions

Fixes #30986

* fix: add session_expired to AuthProfileFailureReason and missing log import

* fix: type cli-runner usage field to match EmbeddedPiAgentMeta

* fix: harden CLI session-expiry recovery handling

* build: regenerate host env security policy swift

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Frank Yang
2026-03-02 09:11:05 +08:00
committed by GitHub
parent a95c8077e8
commit ed86252aa5
9 changed files with 481 additions and 206 deletions

View File

@@ -4,7 +4,9 @@ import { beforeEach, describe, expect, it, type MockInstance, vi } from "vitest"
import { withTempHome as withTempHomeBase } from "../../test/helpers/temp-home.js";
import "../cron/isolated-agent.mocks.js";
import * as cliRunnerModule from "../agents/cli-runner.js";
import { FailoverError } from "../agents/failover-error.js";
import { loadModelCatalog } from "../agents/model-catalog.js";
import * as modelSelectionModule from "../agents/model-selection.js";
import { runEmbeddedPiAgent } from "../agents/pi-embedded.js";
import type { OpenClawConfig } from "../config/config.js";
import * as configModule from "../config/config.js";
@@ -148,6 +150,7 @@ beforeEach(() => {
},
});
vi.mocked(loadModelCatalog).mockResolvedValue([]);
vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(() => false);
});
describe("agentCommand", () => {
@@ -640,6 +643,66 @@ describe("agentCommand", () => {
});
});
it("clears stale Claude CLI legacy session IDs before retrying after session expiration", async () => {
vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(
(provider) => provider.trim().toLowerCase() === "claude-cli",
);
try {
await withTempHome(async (home) => {
const store = path.join(home, "sessions.json");
const sessionKey = "agent:main:subagent:cli-expired";
writeSessionStoreSeed(store, {
[sessionKey]: {
sessionId: "session-cli-123",
updatedAt: Date.now(),
providerOverride: "claude-cli",
modelOverride: "opus",
cliSessionIds: { "claude-cli": "stale-cli-session" },
claudeCliSessionId: "stale-legacy-session",
},
});
mockConfig(home, store, {
model: { primary: "claude-cli/opus", fallbacks: [] },
models: { "claude-cli/opus": {} },
});
runCliAgentSpy
.mockRejectedValueOnce(
new FailoverError("session expired", {
reason: "session_expired",
provider: "claude-cli",
model: "opus",
status: 410,
}),
)
.mockRejectedValue(new Error("retry failed"));
await expect(agentCommand({ message: "hi", sessionKey }, runtime)).rejects.toThrow(
"retry failed",
);
expect(runCliAgentSpy).toHaveBeenCalledTimes(2);
const firstCall = runCliAgentSpy.mock.calls[0]?.[0] as
| { cliSessionId?: string }
| undefined;
const secondCall = runCliAgentSpy.mock.calls[1]?.[0] as
| { cliSessionId?: string }
| undefined;
expect(firstCall?.cliSessionId).toBe("stale-cli-session");
expect(secondCall?.cliSessionId).toBeUndefined();
const saved = JSON.parse(fs.readFileSync(store, "utf-8")) as Record<
string,
{ cliSessionIds?: Record<string, string>; claudeCliSessionId?: string }
>;
const entry = saved[sessionKey];
expect(entry?.cliSessionIds?.["claude-cli"]).toBeUndefined();
expect(entry?.claudeCliSessionId).toBeUndefined();
});
} finally {
vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(() => false);
}
});
it("rejects unknown agent overrides", async () => {
await withTempHome(async (home) => {
const store = path.join(home, "sessions.json");

View File

@@ -1,6 +1,9 @@
import { getAcpSessionManager } from "../acp/control-plane/manager.js";
import { resolveAcpAgentPolicyError, resolveAcpDispatchPolicyError } from "../acp/policy.js";
import { toAcpRuntimeError } from "../acp/runtime/errors.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
const log = createSubsystemLogger("commands/agent");
import {
listAgentIds,
resolveAgentDir,
@@ -12,8 +15,9 @@ import {
import { ensureAuthProfileStore } from "../agents/auth-profiles.js";
import { clearSessionAuthProfileOverride } from "../agents/auth-profiles/session-override.js";
import { runCliAgent } from "../agents/cli-runner.js";
import { getCliSessionId } from "../agents/cli-session.js";
import { getCliSessionId, setCliSessionId } from "../agents/cli-session.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../agents/defaults.js";
import { FailoverError } from "../agents/failover-error.js";
import { formatAgentInternalEventsForPrompt } from "../agents/internal-events.js";
import { AGENT_LANE_SUBAGENT } from "../agents/lanes.js";
import { loadModelCatalog } from "../agents/model-catalog.js";
@@ -23,6 +27,7 @@ import {
isCliProvider,
modelKey,
normalizeModelRef,
normalizeProviderId,
resolveConfiguredModelRef,
resolveDefaultModelForAgent,
resolveThinkingDefault,
@@ -89,7 +94,8 @@ type OverrideFieldClearedByDelete =
| "authProfileOverrideCompactionCount"
| "fallbackNoticeSelectedModel"
| "fallbackNoticeActiveModel"
| "fallbackNoticeReason";
| "fallbackNoticeReason"
| "claudeCliSessionId";
const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [
"providerOverride",
@@ -100,6 +106,7 @@ const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [
"fallbackNoticeSelectedModel",
"fallbackNoticeActiveModel",
"fallbackNoticeReason",
"claudeCliSessionId",
];
async function persistSessionEntry(params: PersistSessionEntryParams): Promise<void> {
@@ -162,6 +169,8 @@ function runAgentAttempt(params: {
agentDir: string;
onAgentEvent: (evt: { stream: string; data?: Record<string, unknown> }) => void;
primaryProvider: string;
sessionStore?: Record<string, SessionEntry>;
storePath?: string;
}) {
const senderIsOwner = params.opts.senderIsOwner ?? true;
const effectivePrompt = resolveFallbackRetryPrompt({
@@ -187,6 +196,94 @@ function runAgentAttempt(params: {
cliSessionId,
images: params.isFallbackRetry ? undefined : params.opts.images,
streamParams: params.opts.streamParams,
}).catch(async (err) => {
// Handle CLI session expired error
if (
err instanceof FailoverError &&
err.reason === "session_expired" &&
cliSessionId &&
params.sessionKey &&
params.sessionStore &&
params.storePath
) {
log.warn(
`CLI session expired, clearing from session store: provider=${params.providerOverride} sessionKey=${params.sessionKey}`,
);
// Clear the expired session ID from the session store
const entry = params.sessionStore[params.sessionKey];
if (entry) {
const updatedEntry = { ...entry };
if (params.providerOverride === "claude-cli") {
delete updatedEntry.claudeCliSessionId;
}
if (updatedEntry.cliSessionIds) {
const normalizedProvider = normalizeProviderId(params.providerOverride);
const newCliSessionIds = { ...updatedEntry.cliSessionIds };
delete newCliSessionIds[normalizedProvider];
updatedEntry.cliSessionIds = newCliSessionIds;
}
updatedEntry.updatedAt = Date.now();
await persistSessionEntry({
sessionStore: params.sessionStore,
sessionKey: params.sessionKey,
storePath: params.storePath,
entry: updatedEntry,
});
// Update the session entry reference
params.sessionEntry = updatedEntry;
}
// Retry with no session ID (will create a new session)
return runCliAgent({
sessionId: params.sessionId,
sessionKey: params.sessionKey,
agentId: params.sessionAgentId,
sessionFile: params.sessionFile,
workspaceDir: params.workspaceDir,
config: params.cfg,
prompt: effectivePrompt,
provider: params.providerOverride,
model: params.modelOverride,
thinkLevel: params.resolvedThinkLevel,
timeoutMs: params.timeoutMs,
runId: params.runId,
extraSystemPrompt: params.opts.extraSystemPrompt,
cliSessionId: undefined, // No session ID to force new session
images: params.isFallbackRetry ? undefined : params.opts.images,
streamParams: params.opts.streamParams,
}).then(async (result) => {
// Update session store with new CLI session ID if available
if (
result.meta.agentMeta?.sessionId &&
params.sessionKey &&
params.sessionStore &&
params.storePath
) {
const entry = params.sessionStore[params.sessionKey];
if (entry) {
const updatedEntry = { ...entry };
setCliSessionId(
updatedEntry,
params.providerOverride,
result.meta.agentMeta.sessionId,
);
updatedEntry.updatedAt = Date.now();
await persistSessionEntry({
sessionStore: params.sessionStore,
sessionKey: params.sessionKey,
storePath: params.storePath,
entry: updatedEntry,
});
}
}
return result;
});
}
throw err;
});
}
@@ -766,6 +863,8 @@ export async function agentCommand(
resolvedVerboseLevel,
agentDir,
primaryProvider: provider,
sessionStore,
storePath,
onAgentEvent: (evt) => {
// Track lifecycle end for fallback emission below.
if (