fix: handle CLI session expired errors gracefully instead of crashing gateway (#31090)

* fix: handle CLI session expired errors gracefully

  - Add session_expired to FailoverReason type
  - Add isCliSessionExpiredErrorMessage to detect expired CLI sessions
  - Modify runCliAgent to retry with new session when session expires
  - Update agentCommand to clear expired session IDs from session store
  - Add proper error handling to prevent gateway crashes on expired sessions

  Fixes #30986

* fix: add session_expired to AuthProfileFailureReason and missing log import

* fix: type cli-runner usage field to match EmbeddedPiAgentMeta

* fix: harden CLI session-expiry recovery handling

* build: regenerate host env security policy swift

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-05-19 22:45:31 +00:00 · 2026-03-02 09:11:05 +08:00
parent a95c8077e8
commit ed86252aa5
9 changed files with 481 additions and 206 deletions
--- a/apps/macos/Sources/OpenClaw/HostEnvSecurityPolicy.generated.swift
+++ b/apps/macos/Sources/OpenClaw/HostEnvSecurityPolicy.generated.swift
@@ -22,17 +22,17 @@ enum HostEnvSecurityPolicy {
        "PS4",
        "GCONV_PATH",
        "IFS",
-        "SSLKEYLOGFILE",
+        "SSLKEYLOGFILE"
    ]

    static let blockedOverrideKeys: Set<String> = [
        "HOME",
-        "ZDOTDIR",
+        "ZDOTDIR"
    ]

    static let blockedPrefixes: [String] = [
        "DYLD_",
        "LD_",
-        "BASH_FUNC_",
+        "BASH_FUNC_"
    ]
 }
--- a/src/agents/auth-profiles/types.ts
+++ b/src/agents/auth-profiles/types.ts
@@ -43,6 +43,7 @@ export type AuthProfileFailureReason =
  | "billing"
  | "timeout"
  | "model_not_found"
+  | "session_expired"
  | "unknown";

 /** Per-profile usage statistics for round-robin and cooldown tracking */
--- a/src/agents/cli-runner.test.ts
+++ b/src/agents/cli-runner.test.ts
@@ -153,6 +153,50 @@ describe("runCliAgent with process supervisor", () => {
    ).rejects.toThrow("exceeded timeout");
  });

+  it("rethrows the retry failure when session-expired recovery retry also fails", async () => { // queues two failing spawns and checks which error reaches the caller
+    supervisorSpawnMock.mockResolvedValueOnce( // first spawn result: exit code 1 with "session expired" on stderr
+      createManagedRun({
+        reason: "exit",
+        exitCode: 1,
+        exitSignal: null,
+        durationMs: 150,
+        stdout: "",
+        stderr: "session expired",
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );
+    supervisorSpawnMock.mockResolvedValueOnce( // second spawn result (the retry): exit code 1 with a different error
+      createManagedRun({
+        reason: "exit",
+        exitCode: 1,
+        exitSignal: null,
+        durationMs: 150,
+        stdout: "",
+        stderr: "rate limit exceeded",
+        timedOut: false,
+        noOutputTimedOut: false,
+      }),
+    );
+
+    await expect(
+      runCliAgent({
+        sessionId: "s1",
+        sessionKey: "agent:main:subagent:retry", // runCliAgent only retries when a sessionKey is present
+        sessionFile: "/tmp/session.jsonl",
+        workspaceDir: "/tmp",
+        prompt: "hi",
+        provider: "codex-cli",
+        model: "gpt-5.2-codex",
+        timeoutMs: 1_000,
+        runId: "run-retry-failure",
+        cliSessionId: "thread-123", // runCliAgent only retries when an existing CLI session ID was supplied
+      }),
+    ).rejects.toThrow("rate limit exceeded"); // the retry's error surfaces, not the original "session expired"
+
+    expect(supervisorSpawnMock).toHaveBeenCalledTimes(2); // initial attempt plus exactly one retry
+  });
+
  it("falls back to per-agent workspace when workspaceDir is missing", async () => {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cli-runner-"));
    const fallbackWorkspace = path.join(tempDir, "workspace-main");
--- a/src/agents/cli-runner.ts
+++ b/src/agents/cli-runner.ts
@@ -122,204 +122,221 @@ export async function runCliAgent(params: {
    agentId: sessionAgentId,
  });

-  const { sessionId: cliSessionIdToSend, isNew } = resolveSessionIdToSend({
-    backend,
-    cliSessionId: params.cliSessionId,
-  });
-  const useResume = Boolean(
-    params.cliSessionId &&
-    cliSessionIdToSend &&
-    backend.resumeArgs &&
-    backend.resumeArgs.length > 0,
-  );
-  const sessionIdSent = cliSessionIdToSend
-    ? useResume || Boolean(backend.sessionArg) || Boolean(backend.sessionArgs?.length)
-      ? cliSessionIdToSend
-      : undefined
-    : undefined;
-  const systemPromptArg = resolveSystemPromptUsage({
-    backend,
-    isNewSession: isNew,
-    systemPrompt,
-  });
-
-  let imagePaths: string[] | undefined;
-  let cleanupImages: (() => Promise<void>) | undefined;
-  let prompt = params.prompt;
-  if (params.images && params.images.length > 0) {
-    const imagePayload = await writeCliImages(params.images);
-    imagePaths = imagePayload.paths;
-    cleanupImages = imagePayload.cleanup;
-    if (!backend.imageArg) {
-      prompt = appendImagePathsToPrompt(prompt, imagePaths);
-    }
-  }
-
-  const { argsPrompt, stdin } = resolvePromptInput({
-    backend,
-    prompt,
-  });
-  const stdinPayload = stdin ?? "";
-  const baseArgs = useResume ? (backend.resumeArgs ?? backend.args ?? []) : (backend.args ?? []);
-  const resolvedArgs = useResume
-    ? baseArgs.map((entry) => entry.replaceAll("{sessionId}", cliSessionIdToSend ?? ""))
-    : baseArgs;
-  const args = buildCliArgs({
-    backend,
-    baseArgs: resolvedArgs,
-    modelId: normalizedModel,
-    sessionId: cliSessionIdToSend,
-    systemPrompt: systemPromptArg,
-    imagePaths,
-    promptArg: argsPrompt,
-    useResume,
-  });
-
-  const serialize = backend.serialize ?? true;
-  const queueKey = serialize ? backendResolved.id : `${backendResolved.id}:${params.runId}`;
-
-  try {
-    const output = await enqueueCliRun(queueKey, async () => {
-      log.info(
-        `cli exec: provider=${params.provider} model=${normalizedModel} promptChars=${params.prompt.length}`,
-      );
-      const logOutputText = isTruthyEnvValue(process.env.OPENCLAW_CLAUDE_CLI_LOG_OUTPUT);
-      if (logOutputText) {
-        const logArgs: string[] = [];
-        for (let i = 0; i < args.length; i += 1) {
-          const arg = args[i] ?? "";
-          if (arg === backend.systemPromptArg) {
-            const systemPromptValue = args[i + 1] ?? "";
-            logArgs.push(arg, `<systemPrompt:${systemPromptValue.length} chars>`);
-            i += 1;
-            continue;
-          }
-          if (arg === backend.sessionArg) {
-            logArgs.push(arg, args[i + 1] ?? "");
-            i += 1;
-            continue;
-          }
-          if (arg === backend.modelArg) {
-            logArgs.push(arg, args[i + 1] ?? "");
-            i += 1;
-            continue;
-          }
-          if (arg === backend.imageArg) {
-            logArgs.push(arg, "<image>");
-            i += 1;
-            continue;
-          }
-          logArgs.push(arg);
-        }
-        if (argsPrompt) {
-          const promptIndex = logArgs.indexOf(argsPrompt);
-          if (promptIndex >= 0) {
-            logArgs[promptIndex] = `<prompt:${argsPrompt.length} chars>`;
-          }
-        }
-        log.info(`cli argv: ${backend.command} ${logArgs.join(" ")}`);
-      }
-
-      const env = (() => {
-        const next = { ...process.env, ...backend.env };
-        for (const key of backend.clearEnv ?? []) {
-          delete next[key];
-        }
-        return next;
-      })();
-      const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({
-        backend,
-        timeoutMs: params.timeoutMs,
-        useResume,
-      });
-      const supervisor = getProcessSupervisor();
-      const scopeKey = buildCliSupervisorScopeKey({
-        backend,
-        backendId: backendResolved.id,
-        cliSessionId: useResume ? cliSessionIdToSend : undefined,
-      });
-
-      const managedRun = await supervisor.spawn({
-        sessionId: params.sessionId,
-        backendId: backendResolved.id,
-        scopeKey,
-        replaceExistingScope: Boolean(useResume && scopeKey),
-        mode: "child",
-        argv: [backend.command, ...args],
-        timeoutMs: params.timeoutMs,
-        noOutputTimeoutMs,
-        cwd: workspaceDir,
-        env,
-        input: stdinPayload,
-      });
-      const result = await managedRun.wait();
-
-      const stdout = result.stdout.trim();
-      const stderr = result.stderr.trim();
-      if (logOutputText) {
-        if (stdout) {
-          log.info(`cli stdout:\n${stdout}`);
-        }
-        if (stderr) {
-          log.info(`cli stderr:\n${stderr}`);
-        }
-      }
-      if (shouldLogVerbose()) {
-        if (stdout) {
-          log.debug(`cli stdout:\n${stdout}`);
-        }
-        if (stderr) {
-          log.debug(`cli stderr:\n${stderr}`);
-        }
-      }
-
-      if (result.exitCode !== 0 || result.reason !== "exit") {
-        if (result.reason === "no-output-timeout" || result.noOutputTimedOut) {
-          const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`;
-          log.warn(
-            `cli watchdog timeout: provider=${params.provider} model=${modelId} session=${cliSessionIdToSend ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`,
-          );
-          throw new FailoverError(timeoutReason, {
-            reason: "timeout",
-            provider: params.provider,
-            model: modelId,
-            status: resolveFailoverStatus("timeout"),
-          });
-        }
-        if (result.reason === "overall-timeout") {
-          const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`;
-          throw new FailoverError(timeoutReason, {
-            reason: "timeout",
-            provider: params.provider,
-            model: modelId,
-            status: resolveFailoverStatus("timeout"),
-          });
-        }
-        const err = stderr || stdout || "CLI failed.";
-        const reason = classifyFailoverReason(err) ?? "unknown";
-        const status = resolveFailoverStatus(reason);
-        throw new FailoverError(err, {
-          reason,
-          provider: params.provider,
-          model: modelId,
-          status,
-        });
-      }
-
-      const outputMode = useResume ? (backend.resumeOutput ?? backend.output) : backend.output;
-
-      if (outputMode === "text") {
-        return { text: stdout, sessionId: undefined };
-      }
-      if (outputMode === "jsonl") {
-        const parsed = parseCliJsonl(stdout, backend);
-        return parsed ?? { text: stdout };
-      }
-
-      const parsed = parseCliJson(stdout, backend);
-      return parsed ?? { text: stdout };
+  // Runs the CLI backend once for the given CLI session ID (undefined means no prior session) and returns its parsed output; any image payload is released before returning.
+  const executeCliWithSession = async (
+    cliSessionIdToUse?: string,
+  ): Promise<{
+    text: string;
+    sessionId?: string;
+    usage?: {
+      input?: number;
+      output?: number;
+      cacheRead?: number;
+      cacheWrite?: number;
+      total?: number;
+    };
+  }> => {
+    const { sessionId: resolvedSessionId, isNew } = resolveSessionIdToSend({
+      backend,
+      cliSessionId: cliSessionIdToUse,
+    });
+    const useResume = Boolean( // resume only when an ID was supplied, one was resolved, and the backend defines resume args
+      cliSessionIdToUse && resolvedSessionId && backend.resumeArgs && backend.resumeArgs.length > 0,
+    );
+    const systemPromptArg = resolveSystemPromptUsage({
+      backend,
+      isNewSession: isNew,
+      systemPrompt,
    });

+    let imagePaths: string[] | undefined;
+    let cleanupImages: (() => Promise<void>) | undefined;
+    let prompt = params.prompt;
+    if (params.images && params.images.length > 0) { // image payload is prepared per call and released in the finally block below
+      const imagePayload = await writeCliImages(params.images);
+      imagePaths = imagePayload.paths;
+      cleanupImages = imagePayload.cleanup;
+      if (!backend.imageArg) { // backend has no image argument: reference the image paths in the prompt text instead
+        prompt = appendImagePathsToPrompt(prompt, imagePaths);
+      }
+    }
+
+    const { argsPrompt, stdin } = resolvePromptInput({
+      backend,
+      prompt,
+    });
+    const stdinPayload = stdin ?? "";
+    const baseArgs = useResume ? (backend.resumeArgs ?? backend.args ?? []) : (backend.args ?? []);
+    const resolvedArgs = useResume
+      ? baseArgs.map((entry) => entry.replaceAll("{sessionId}", resolvedSessionId ?? "")) // fill the {sessionId} placeholder in resume args
+      : baseArgs;
+    const args = buildCliArgs({
+      backend,
+      baseArgs: resolvedArgs,
+      modelId: normalizedModel,
+      sessionId: resolvedSessionId,
+      systemPrompt: systemPromptArg,
+      imagePaths,
+      promptArg: argsPrompt,
+      useResume,
+    });
+
+    const serialize = backend.serialize ?? true; // serialization is the default when the backend does not say otherwise
+    const queueKey = serialize ? backendResolved.id : `${backendResolved.id}:${params.runId}`; // one queue key per backend when serialized, otherwise one per run
+
+    try {
+      const output = await enqueueCliRun(queueKey, async () => {
+        log.info(
+          `cli exec: provider=${params.provider} model=${normalizedModel} promptChars=${params.prompt.length}`,
+        );
+        const logOutputText = isTruthyEnvValue(process.env.OPENCLAW_CLAUDE_CLI_LOG_OUTPUT);
+        if (logOutputText) { // opt-in argv logging; system prompt, image and prompt values are replaced by placeholders
+          const logArgs: string[] = [];
+          for (let i = 0; i < args.length; i += 1) {
+            const arg = args[i] ?? "";
+            if (arg === backend.systemPromptArg) {
+              const systemPromptValue = args[i + 1] ?? "";
+              logArgs.push(arg, `<systemPrompt:${systemPromptValue.length} chars>`);
+              i += 1; // skip the value that follows the flag
+              continue;
+            }
+            if (arg === backend.sessionArg) {
+              logArgs.push(arg, args[i + 1] ?? "");
+              i += 1;
+              continue;
+            }
+            if (arg === backend.modelArg) {
+              logArgs.push(arg, args[i + 1] ?? "");
+              i += 1;
+              continue;
+            }
+            if (arg === backend.imageArg) {
+              logArgs.push(arg, "<image>");
+              i += 1;
+              continue;
+            }
+            logArgs.push(arg);
+          }
+          if (argsPrompt) {
+            const promptIndex = logArgs.indexOf(argsPrompt);
+            if (promptIndex >= 0) {
+              logArgs[promptIndex] = `<prompt:${argsPrompt.length} chars>`;
+            }
+          }
+          log.info(`cli argv: ${backend.command} ${logArgs.join(" ")}`);
+        }
+
+        const env = (() => { // process env overlaid with backend.env, minus any keys listed in backend.clearEnv
+          const next = { ...process.env, ...backend.env };
+          for (const key of backend.clearEnv ?? []) {
+            delete next[key];
+          }
+          return next;
+        })();
+        const noOutputTimeoutMs = resolveCliNoOutputTimeoutMs({
+          backend,
+          timeoutMs: params.timeoutMs,
+          useResume,
+        });
+        const supervisor = getProcessSupervisor();
+        const scopeKey = buildCliSupervisorScopeKey({
+          backend,
+          backendId: backendResolved.id,
+          cliSessionId: useResume ? resolvedSessionId : undefined, // the session ID is passed to the scope key only for resumed runs
+        });
+
+        const managedRun = await supervisor.spawn({
+          sessionId: params.sessionId,
+          backendId: backendResolved.id,
+          scopeKey,
+          replaceExistingScope: Boolean(useResume && scopeKey),
+          mode: "child",
+          argv: [backend.command, ...args],
+          timeoutMs: params.timeoutMs,
+          noOutputTimeoutMs,
+          cwd: workspaceDir,
+          env,
+          input: stdinPayload,
+        });
+        const result = await managedRun.wait();
+
+        const stdout = result.stdout.trim();
+        const stderr = result.stderr.trim();
+        if (logOutputText) {
+          if (stdout) {
+            log.info(`cli stdout:\n${stdout}`);
+          }
+          if (stderr) {
+            log.info(`cli stderr:\n${stderr}`);
+          }
+        }
+        if (shouldLogVerbose()) {
+          if (stdout) {
+            log.debug(`cli stdout:\n${stdout}`);
+          }
+          if (stderr) {
+            log.debug(`cli stderr:\n${stderr}`);
+          }
+        }
+
+        if (result.exitCode !== 0 || result.reason !== "exit") { // anything other than a normal exit with code 0 is a failure
+          if (result.reason === "no-output-timeout" || result.noOutputTimedOut) {
+            const timeoutReason = `CLI produced no output for ${Math.round(noOutputTimeoutMs / 1000)}s and was terminated.`;
+            log.warn(
+              `cli watchdog timeout: provider=${params.provider} model=${modelId} session=${resolvedSessionId ?? params.sessionId} noOutputTimeoutMs=${noOutputTimeoutMs} pid=${managedRun.pid ?? "unknown"}`,
+            );
+            throw new FailoverError(timeoutReason, {
+              reason: "timeout",
+              provider: params.provider,
+              model: modelId,
+              status: resolveFailoverStatus("timeout"),
+            });
+          }
+          if (result.reason === "overall-timeout") {
+            const timeoutReason = `CLI exceeded timeout (${Math.round(params.timeoutMs / 1000)}s) and was terminated.`;
+            throw new FailoverError(timeoutReason, {
+              reason: "timeout",
+              provider: params.provider,
+              model: modelId,
+              status: resolveFailoverStatus("timeout"),
+            });
+          }
+          const err = stderr || stdout || "CLI failed."; // prefer stderr, then stdout, then a generic message
+          const reason = classifyFailoverReason(err) ?? "unknown"; // unclassified errors fall back to "unknown"
+          const status = resolveFailoverStatus(reason);
+          throw new FailoverError(err, {
+            reason,
+            provider: params.provider,
+            model: modelId,
+            status,
+          });
+        }
+
+        const outputMode = useResume ? (backend.resumeOutput ?? backend.output) : backend.output; // a resumed run may declare its own output format
+
+        if (outputMode === "text") {
+          return { text: stdout, sessionId: undefined }; // text mode carries no session ID
+        }
+        if (outputMode === "jsonl") {
+          const parsed = parseCliJsonl(stdout, backend);
+          return parsed ?? { text: stdout }; // fall back to raw stdout when parsing yields nothing
+        }
+
+        const parsed = parseCliJson(stdout, backend);
+        return parsed ?? { text: stdout }; // fall back to raw stdout when parsing yields nothing
+      });
+
+      return output;
+    } finally { // runs on success and on failure alike
+      if (cleanupImages) {
+        await cleanupImages();
+      }
+    }
+  };
+
+  // Try with the provided CLI session ID first
+  try {
+    const output = await executeCliWithSession(params.cliSessionId);
    const text = output.text?.trim();
    const payloads = text ? [{ text }] : undefined;

@@ -328,7 +345,7 @@ export async function runCliAgent(params: {
      meta: {
        durationMs: Date.now() - started,
        agentMeta: {
-          sessionId: output.sessionId ?? sessionIdSent ?? params.sessionId ?? "",
+          sessionId: output.sessionId ?? params.cliSessionId ?? params.sessionId ?? "",
          provider: params.provider,
          model: modelId,
          usage: output.usage,
@@ -337,6 +354,34 @@ export async function runCliAgent(params: {
    };
  } catch (err) {
    if (err instanceof FailoverError) {
+      // Check if this is a session expired error and we have a session to clear
+      if (err.reason === "session_expired" && params.cliSessionId && params.sessionKey) {
+        log.warn(
+          `CLI session expired, clearing session ID and retrying: provider=${params.provider} session=${redactRunIdentifier(params.cliSessionId)}`,
+        );
+
+        // The stale session ID cannot be cleared from the session store here,
+        // because this function has no access to the store; any store cleanup
+        // is left to the caller.
+
+        // Retry without a session ID so the backend starts a new session.
+        const output = await executeCliWithSession(undefined);
+        const text = output.text?.trim();
+        const payloads = text ? [{ text }] : undefined;
+
+        return {
+          payloads,
+          meta: {
+            durationMs: Date.now() - started,
+            agentMeta: {
+              sessionId: output.sessionId ?? params.sessionId ?? "",
+              provider: params.provider,
+              model: modelId,
+              usage: output.usage,
+            },
+          },
+        };
+      }
      throw err;
    }
    const message = err instanceof Error ? err.message : String(err);
@@ -351,10 +396,6 @@ export async function runCliAgent(params: {
      });
    }
    throw err;
-  } finally {
-    if (cleanupImages) {
-      await cleanupImages();
-    }
  }
 }

--- a/src/agents/failover-error.ts
+++ b/src/agents/failover-error.ts
@@ -59,6 +59,8 @@ export function resolveFailoverStatus(reason: FailoverReason): number | undefine
      return 400;
    case "model_not_found":
      return 404;
+    case "session_expired":
+      return 410; // Gone - session no longer exists
    default:
      return undefined;
  }
--- a/src/agents/pi-embedded-helpers/errors.ts
+++ b/src/agents/pi-embedded-helpers/errors.ts
@@ -883,6 +883,27 @@ export function isModelNotFoundErrorMessage(raw: string): boolean {
  return false;
 }

+function isCliSessionExpiredErrorMessage(raw: string): boolean { // true when `raw` contains one of the session/conversation "not found", "expired" or "invalid" phrases listed below
+  if (!raw) { // empty message: nothing to match
+    return false;
+  }
+  const lower = raw.toLowerCase(); // lowercase once so every check below is case-insensitive
+  return ( // plain substring checks; any single phrase is sufficient
+    lower.includes("session not found") ||
+    lower.includes("session does not exist") ||
+    lower.includes("session expired") ||
+    lower.includes("session invalid") ||
+    lower.includes("conversation not found") ||
+    lower.includes("conversation does not exist") ||
+    lower.includes("conversation expired") ||
+    lower.includes("conversation invalid") ||
+    lower.includes("no such session") ||
+    lower.includes("invalid session") ||
+    lower.includes("session id not found") ||
+    lower.includes("conversation id not found")
+  );
+}
+
 export function classifyFailoverReason(raw: string): FailoverReason | null {
  if (isImageDimensionErrorMessage(raw)) {
    return null;
@@ -890,6 +911,9 @@ export function classifyFailoverReason(raw: string): FailoverReason | null {
  if (isImageSizeError(raw)) {
    return null;
  }
+  if (isCliSessionExpiredErrorMessage(raw)) {
+    return "session_expired";
+  }
  if (isModelNotFoundErrorMessage(raw)) {
    return "model_not_found";
  }
--- a/src/agents/pi-embedded-helpers/types.ts
+++ b/src/agents/pi-embedded-helpers/types.ts
@@ -8,4 +8,5 @@ export type FailoverReason =
  | "billing"
  | "timeout"
  | "model_not_found"
+  | "session_expired"
  | "unknown";
--- a/src/commands/agent.test.ts
+++ b/src/commands/agent.test.ts
@@ -4,7 +4,9 @@ import { beforeEach, describe, expect, it, type MockInstance, vi } from "vitest"
 import { withTempHome as withTempHomeBase } from "../../test/helpers/temp-home.js";
 import "../cron/isolated-agent.mocks.js";
 import * as cliRunnerModule from "../agents/cli-runner.js";
+import { FailoverError } from "../agents/failover-error.js";
 import { loadModelCatalog } from "../agents/model-catalog.js";
+import * as modelSelectionModule from "../agents/model-selection.js";
 import { runEmbeddedPiAgent } from "../agents/pi-embedded.js";
 import type { OpenClawConfig } from "../config/config.js";
 import * as configModule from "../config/config.js";
@@ -148,6 +150,7 @@ beforeEach(() => {
    },
  });
  vi.mocked(loadModelCatalog).mockResolvedValue([]);
+  vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(() => false);
 });

 describe("agentCommand", () => {
@@ -640,6 +643,66 @@ describe("agentCommand", () => {
    });
  });

+  it("clears stale Claude CLI legacy session IDs before retrying after session expiration", async () => { // seeds both ID fields, fails twice, then inspects the persisted store
+    vi.mocked(modelSelectionModule.isCliProvider).mockImplementation( // for this test only, report "claude-cli" (trimmed, case-insensitive) as a CLI provider
+      (provider) => provider.trim().toLowerCase() === "claude-cli",
+    );
+    try {
+      await withTempHome(async (home) => {
+        const store = path.join(home, "sessions.json");
+        const sessionKey = "agent:main:subagent:cli-expired";
+        writeSessionStoreSeed(store, {
+          [sessionKey]: {
+            sessionId: "session-cli-123",
+            updatedAt: Date.now(),
+            providerOverride: "claude-cli",
+            modelOverride: "opus",
+            cliSessionIds: { "claude-cli": "stale-cli-session" }, // per-provider map entry
+            claudeCliSessionId: "stale-legacy-session", // legacy single-field ID
+          },
+        });
+        mockConfig(home, store, {
+          model: { primary: "claude-cli/opus", fallbacks: [] },
+          models: { "claude-cli/opus": {} },
+        });
+        runCliAgentSpy
+          .mockRejectedValueOnce( // first call: session_expired FailoverError
+            new FailoverError("session expired", {
+              reason: "session_expired",
+              provider: "claude-cli",
+              model: "opus",
+              status: 410,
+            }),
+          )
+          .mockRejectedValue(new Error("retry failed")); // every later call: a plain error, so the retry fails too
+
+        await expect(agentCommand({ message: "hi", sessionKey }, runtime)).rejects.toThrow(
+          "retry failed",
+        );
+
+        expect(runCliAgentSpy).toHaveBeenCalledTimes(2);
+        const firstCall = runCliAgentSpy.mock.calls[0]?.[0] as
+          | { cliSessionId?: string }
+          | undefined;
+        const secondCall = runCliAgentSpy.mock.calls[1]?.[0] as
+          | { cliSessionId?: string }
+          | undefined;
+        expect(firstCall?.cliSessionId).toBe("stale-cli-session"); // the first attempt is given the ID from the cliSessionIds map
+        expect(secondCall?.cliSessionId).toBeUndefined(); // the retry is given no session ID
+
+        const saved = JSON.parse(fs.readFileSync(store, "utf-8")) as Record< // re-read the store from disk to check what was persisted
+          string,
+          { cliSessionIds?: Record<string, string>; claudeCliSessionId?: string }
+        >;
+        const entry = saved[sessionKey];
+        expect(entry?.cliSessionIds?.["claude-cli"]).toBeUndefined(); // both ID fields are gone even though the retry failed
+        expect(entry?.claudeCliSessionId).toBeUndefined();
+      });
+    } finally {
+      vi.mocked(modelSelectionModule.isCliProvider).mockImplementation(() => false); // put back the default stub installed in beforeEach
+    }
+  });
+
  it("rejects unknown agent overrides", async () => {
    await withTempHome(async (home) => {
      const store = path.join(home, "sessions.json");
--- a/src/commands/agent.ts
+++ b/src/commands/agent.ts
@@ -1,6 +1,9 @@
 import { getAcpSessionManager } from "../acp/control-plane/manager.js";
 import { resolveAcpAgentPolicyError, resolveAcpDispatchPolicyError } from "../acp/policy.js";
 import { toAcpRuntimeError } from "../acp/runtime/errors.js";
+import { createSubsystemLogger } from "../logging/subsystem.js";
+
+const log = createSubsystemLogger("commands/agent");
 import {
  listAgentIds,
  resolveAgentDir,
@@ -12,8 +15,9 @@ import {
 import { ensureAuthProfileStore } from "../agents/auth-profiles.js";
 import { clearSessionAuthProfileOverride } from "../agents/auth-profiles/session-override.js";
 import { runCliAgent } from "../agents/cli-runner.js";
-import { getCliSessionId } from "../agents/cli-session.js";
+import { getCliSessionId, setCliSessionId } from "../agents/cli-session.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../agents/defaults.js";
+import { FailoverError } from "../agents/failover-error.js";
 import { formatAgentInternalEventsForPrompt } from "../agents/internal-events.js";
 import { AGENT_LANE_SUBAGENT } from "../agents/lanes.js";
 import { loadModelCatalog } from "../agents/model-catalog.js";
@@ -23,6 +27,7 @@ import {
  isCliProvider,
  modelKey,
  normalizeModelRef,
+  normalizeProviderId,
  resolveConfiguredModelRef,
  resolveDefaultModelForAgent,
  resolveThinkingDefault,
@@ -89,7 +94,8 @@ type OverrideFieldClearedByDelete =
  | "authProfileOverrideCompactionCount"
  | "fallbackNoticeSelectedModel"
  | "fallbackNoticeActiveModel"
-  | "fallbackNoticeReason";
+  | "fallbackNoticeReason"
+  | "claudeCliSessionId";

 const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [
  "providerOverride",
@@ -100,6 +106,7 @@ const OVERRIDE_FIELDS_CLEARED_BY_DELETE: OverrideFieldClearedByDelete[] = [
  "fallbackNoticeSelectedModel",
  "fallbackNoticeActiveModel",
  "fallbackNoticeReason",
+  "claudeCliSessionId",
 ];

 async function persistSessionEntry(params: PersistSessionEntryParams): Promise<void> {
@@ -162,6 +169,8 @@ function runAgentAttempt(params: {
  agentDir: string;
  onAgentEvent: (evt: { stream: string; data?: Record<string, unknown> }) => void;
  primaryProvider: string;
+  sessionStore?: Record<string, SessionEntry>;
+  storePath?: string;
 }) {
  const senderIsOwner = params.opts.senderIsOwner ?? true;
  const effectivePrompt = resolveFallbackRetryPrompt({
@@ -187,6 +196,94 @@ function runAgentAttempt(params: {
      cliSessionId,
      images: params.isFallbackRetry ? undefined : params.opts.images,
      streamParams: params.opts.streamParams,
+    }).catch(async (err) => {
+      // Handle CLI session expired error
+      if (
+        err instanceof FailoverError &&
+        err.reason === "session_expired" &&
+        cliSessionId &&
+        params.sessionKey &&
+        params.sessionStore &&
+        params.storePath
+      ) {
+        log.warn(
+          `CLI session expired, clearing from session store: provider=${params.providerOverride} sessionKey=${params.sessionKey}`,
+        );
+
+        // Clear the expired session ID from the session store
+        const entry = params.sessionStore[params.sessionKey];
+        if (entry) {
+          const updatedEntry = { ...entry };
+          if (params.providerOverride === "claude-cli") {
+            delete updatedEntry.claudeCliSessionId;
+          }
+          if (updatedEntry.cliSessionIds) {
+            const normalizedProvider = normalizeProviderId(params.providerOverride);
+            const newCliSessionIds = { ...updatedEntry.cliSessionIds };
+            delete newCliSessionIds[normalizedProvider];
+            updatedEntry.cliSessionIds = newCliSessionIds;
+          }
+          updatedEntry.updatedAt = Date.now();
+
+          await persistSessionEntry({
+            sessionStore: params.sessionStore,
+            sessionKey: params.sessionKey,
+            storePath: params.storePath,
+            entry: updatedEntry,
+          });
+
+          // Update the session entry reference
+          params.sessionEntry = updatedEntry;
+        }
+
+        // Retry with no session ID (will create a new session)
+        return runCliAgent({
+          sessionId: params.sessionId,
+          sessionKey: params.sessionKey,
+          agentId: params.sessionAgentId,
+          sessionFile: params.sessionFile,
+          workspaceDir: params.workspaceDir,
+          config: params.cfg,
+          prompt: effectivePrompt,
+          provider: params.providerOverride,
+          model: params.modelOverride,
+          thinkLevel: params.resolvedThinkLevel,
+          timeoutMs: params.timeoutMs,
+          runId: params.runId,
+          extraSystemPrompt: params.opts.extraSystemPrompt,
+          cliSessionId: undefined, // No session ID to force new session
+          images: params.isFallbackRetry ? undefined : params.opts.images,
+          streamParams: params.opts.streamParams,
+        }).then(async (result) => {
+          // Update session store with new CLI session ID if available
+          if (
+            result.meta.agentMeta?.sessionId &&
+            params.sessionKey &&
+            params.sessionStore &&
+            params.storePath
+          ) {
+            const entry = params.sessionStore[params.sessionKey];
+            if (entry) {
+              const updatedEntry = { ...entry };
+              setCliSessionId(
+                updatedEntry,
+                params.providerOverride,
+                result.meta.agentMeta.sessionId,
+              );
+              updatedEntry.updatedAt = Date.now();
+
+              await persistSessionEntry({
+                sessionStore: params.sessionStore,
+                sessionKey: params.sessionKey,
+                storePath: params.storePath,
+                entry: updatedEntry,
+              });
+            }
+          }
+          return result;
+        });
+      }
+      throw err;
    });
  }

@@ -766,6 +863,8 @@ export async function agentCommand(
            resolvedVerboseLevel,
            agentDir,
            primaryProvider: provider,
+            sessionStore,
+            storePath,
            onAgentEvent: (evt) => {
              // Track lifecycle end for fallback emission below.
              if (