Agents: add embedded error observations (#41336)

Merged via squash. Prepared head SHA: 4900042298 Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com> Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com> Reviewed-by: @altaywtf
2026-05-19 14:35:30 +00:00 · 2026-03-09 22:27:05 +03:00
parent d4e59a3666
commit 87d939be79
9 changed files with 634 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ Docs: https://docs.openclaw.ai
 - ACP/sessions.patch: allow `spawnedBy` and `spawnDepth` lineage fields on ACP session keys so `sessions_spawn` with `runtime: "acp"` no longer fails during child-session setup. Fixes #40971. (#40995) thanks @xaeon2026.
 - ACP/stop reason mapping: resolve gateway chat `state: "error"` completions as ACP `end_turn` instead of `refusal` so transient backend failures are not surfaced as deliberate refusals. (#41187) thanks @pejmanjohn.
 - ACP/setSessionMode: propagate gateway `sessions.patch` failures back to ACP clients so rejected mode changes no longer return silent success. (#41185) thanks @pejmanjohn.
+- Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.

 ## 2026.3.8

--- a/src/agents/pi-embedded-error-observation.test.ts
+++ b/src/agents/pi-embedded-error-observation.test.ts
@@ -0,0 +1,182 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import * as loggingConfigModule from "../logging/config.js";
+import {
+  buildApiErrorObservationFields,
+  buildTextObservationFields,
+  sanitizeForConsole,
+} from "./pi-embedded-error-observation.js";
+
+afterEach(() => {
+  vi.restoreAllMocks();
+});
+
+describe("buildApiErrorObservationFields", () => {
+  it("redacts request ids and exposes stable hashes instead of raw payloads", () => {
+    const observed = buildApiErrorObservationFields(
+      '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"},"request_id":"req_overload"}',
+    );
+
+    expect(observed).toMatchObject({
+      rawErrorPreview: expect.stringContaining('"request_id":"sha256:'),
+      rawErrorHash: expect.stringMatching(/^sha256:/),
+      rawErrorFingerprint: expect.stringMatching(/^sha256:/),
+      providerErrorType: "overloaded_error",
+      providerErrorMessagePreview: "Overloaded",
+      requestIdHash: expect.stringMatching(/^sha256:/),
+    });
+    expect(observed.rawErrorPreview).not.toContain("req_overload");
+  });
+
+  it("forces token redaction for observation previews", () => {
+    const observed = buildApiErrorObservationFields(
+      "Authorization: Bearer sk-abcdefghijklmnopqrstuvwxyz123456",
+    );
+
+    expect(observed.rawErrorPreview).not.toContain("sk-abcdefghijklmnopqrstuvwxyz123456");
+    expect(observed.rawErrorPreview).toContain("sk-abc");
+    expect(observed.rawErrorHash).toMatch(/^sha256:/);
+  });
+
+  it("redacts observation-only header and cookie formats", () => {
+    const observed = buildApiErrorObservationFields(
+      "x-api-key: sk-abcdefghijklmnopqrstuvwxyz123456 Cookie: session=abcdefghijklmnopqrstuvwxyz123456",
+    );
+
+    expect(observed.rawErrorPreview).not.toContain("abcdefghijklmnopqrstuvwxyz123456");
+    expect(observed.rawErrorPreview).toContain("x-api-key: ***");
+    expect(observed.rawErrorPreview).toContain("Cookie: session=");
+  });
+
+  it("does not let cookie redaction consume unrelated fields on the same line", () => {
+    const observed = buildApiErrorObservationFields(
+      "Cookie: session=abcdefghijklmnopqrstuvwxyz123456 status=503 request_id=req_cookie",
+    );
+
+    expect(observed.rawErrorPreview).toContain("Cookie: session=");
+    expect(observed.rawErrorPreview).toContain("status=503");
+    expect(observed.rawErrorPreview).toContain("request_id=sha256:");
+  });
+
+  it("builds sanitized generic text observation fields", () => {
+    const observed = buildTextObservationFields(
+      '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"},"request_id":"req_prev"}',
+    );
+
+    expect(observed).toMatchObject({
+      textPreview: expect.stringContaining('"request_id":"sha256:'),
+      textHash: expect.stringMatching(/^sha256:/),
+      textFingerprint: expect.stringMatching(/^sha256:/),
+      providerErrorType: "overloaded_error",
+      providerErrorMessagePreview: "Overloaded",
+      requestIdHash: expect.stringMatching(/^sha256:/),
+    });
+    expect(observed.textPreview).not.toContain("req_prev");
+  });
+
+  it("redacts request ids in formatted plain-text errors", () => {
+    const observed = buildApiErrorObservationFields(
+      "LLM error overloaded_error: Overloaded (request_id: req_plaintext_123)",
+    );
+
+    expect(observed).toMatchObject({
+      rawErrorPreview: expect.stringContaining("request_id: sha256:"),
+      rawErrorFingerprint: expect.stringMatching(/^sha256:/),
+      requestIdHash: expect.stringMatching(/^sha256:/),
+    });
+    expect(observed.rawErrorPreview).not.toContain("req_plaintext_123");
+  });
+
+  it("keeps fingerprints stable across request ids for equivalent errors", () => {
+    const first = buildApiErrorObservationFields(
+      '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"},"request_id":"req_001"}',
+    );
+    const second = buildApiErrorObservationFields(
+      '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"},"request_id":"req_002"}',
+    );
+
+    expect(first.rawErrorFingerprint).toBe(second.rawErrorFingerprint);
+    expect(first.rawErrorHash).not.toBe(second.rawErrorHash);
+  });
+
+  it("truncates oversized raw and provider previews", () => {
+    const longMessage = "X".repeat(260);
+    const observed = buildApiErrorObservationFields(
+      `{"type":"error","error":{"type":"server_error","message":"${longMessage}"},"request_id":"req_long"}`,
+    );
+
+    expect(observed.rawErrorPreview).toBeDefined();
+    expect(observed.providerErrorMessagePreview).toBeDefined();
+    expect(observed.rawErrorPreview?.length).toBeLessThanOrEqual(401);
+    expect(observed.providerErrorMessagePreview?.length).toBeLessThanOrEqual(201);
+    expect(observed.providerErrorMessagePreview?.endsWith("…")).toBe(true);
+  });
+
+  it("caps oversized raw inputs before hashing and fingerprinting", () => {
+    const oversized = "X".repeat(70_000);
+    const bounded = "X".repeat(64_000);
+
+    expect(buildApiErrorObservationFields(oversized)).toMatchObject({
+      rawErrorHash: buildApiErrorObservationFields(bounded).rawErrorHash,
+      rawErrorFingerprint: buildApiErrorObservationFields(bounded).rawErrorFingerprint,
+    });
+  });
+
+  it("returns empty observation fields for empty input", () => {
+    expect(buildApiErrorObservationFields(undefined)).toEqual({});
+    expect(buildApiErrorObservationFields("")).toEqual({});
+    expect(buildApiErrorObservationFields("   ")).toEqual({});
+  });
+
+  it("re-reads configured redact patterns on each call", () => {
+    const readLoggingConfig = vi.spyOn(loggingConfigModule, "readLoggingConfig");
+    readLoggingConfig.mockReturnValueOnce(undefined);
+    readLoggingConfig.mockReturnValueOnce({
+      redactPatterns: [String.raw`\bcustom-secret-[A-Za-z0-9]+\b`],
+    });
+
+    const first = buildApiErrorObservationFields("custom-secret-abc123");
+    const second = buildApiErrorObservationFields("custom-secret-abc123");
+
+    expect(first.rawErrorPreview).toContain("custom-secret-abc123");
+    expect(second.rawErrorPreview).not.toContain("custom-secret-abc123");
+    expect(second.rawErrorPreview).toContain("custom");
+  });
+
+  it("fails closed when observation sanitization throws", () => {
+    vi.spyOn(loggingConfigModule, "readLoggingConfig").mockImplementation(() => {
+      throw new Error("boom");
+    });
+
+    expect(buildApiErrorObservationFields("request_id=req_123")).toEqual({});
+    expect(buildTextObservationFields("request_id=req_123")).toEqual({
+      textPreview: undefined,
+      textHash: undefined,
+      textFingerprint: undefined,
+      httpCode: undefined,
+      providerErrorType: undefined,
+      providerErrorMessagePreview: undefined,
+      requestIdHash: undefined,
+    });
+  });
+
+  it("ignores non-string configured redact patterns", () => {
+    vi.spyOn(loggingConfigModule, "readLoggingConfig").mockReturnValue({
+      redactPatterns: [
+        123 as never,
+        { bad: true } as never,
+        String.raw`\bcustom-secret-[A-Za-z0-9]+\b`,
+      ],
+    });
+
+    const observed = buildApiErrorObservationFields("custom-secret-abc123");
+
+    expect(observed.rawErrorPreview).not.toContain("custom-secret-abc123");
+    expect(observed.rawErrorPreview).toContain("custom");
+  });
+});
+
+describe("sanitizeForConsole", () => {
+  it("strips control characters from console-facing values", () => {
+    expect(sanitizeForConsole("run-1\nprovider\tmodel\rtest")).toBe("run-1 provider model test");
+  });
+});
--- a/src/agents/pi-embedded-error-observation.ts
+++ b/src/agents/pi-embedded-error-observation.ts
@@ -0,0 +1,199 @@
+import { readLoggingConfig } from "../logging/config.js";
+import { redactIdentifier } from "../logging/redact-identifier.js";
+import { getDefaultRedactPatterns, redactSensitiveText } from "../logging/redact.js";
+import { getApiErrorPayloadFingerprint, parseApiErrorInfo } from "./pi-embedded-helpers.js";
+import { stableStringify } from "./stable-stringify.js";
+
+const MAX_OBSERVATION_INPUT_CHARS = 64_000;
+const MAX_FINGERPRINT_MESSAGE_CHARS = 8_000;
+const RAW_ERROR_PREVIEW_MAX_CHARS = 400;
+const PROVIDER_ERROR_PREVIEW_MAX_CHARS = 200;
+const REQUEST_ID_RE = /\brequest[_ ]?id\b\s*[:=]\s*["'()]*([A-Za-z0-9._:-]+)/i;
+const OBSERVATION_EXTRA_REDACT_PATTERNS = [
+  String.raw`\b(?:x-)?api[-_]?key\b\s*[:=]\s*(["']?)([^\s"'\\;]+)\1`,
+  String.raw`"(?:api[-_]?key|api_key)"\s*:\s*"([^"]+)"`,
+  String.raw`(?:\bCookie\b\s*[:=]\s*[^;=\s]+=|;\s*[^;=\s]+=)([^;\s\r\n]+)`,
+];
+
+function resolveConfiguredRedactPatterns(): string[] {
+  const configured = readLoggingConfig()?.redactPatterns;
+  if (!Array.isArray(configured)) {
+    return [];
+  }
+  return configured.filter((pattern): pattern is string => typeof pattern === "string");
+}
+
+function truncateForObservation(text: string | undefined, maxChars: number): string | undefined {
+  const trimmed = text?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  return trimmed.length > maxChars ? `${trimmed.slice(0, maxChars)}…` : trimmed;
+}
+
+function boundObservationInput(text: string | undefined): string | undefined {
+  const trimmed = text?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  return trimmed.length > MAX_OBSERVATION_INPUT_CHARS
+    ? trimmed.slice(0, MAX_OBSERVATION_INPUT_CHARS)
+    : trimmed;
+}
+
+export function sanitizeForConsole(text: string | undefined, maxChars = 200): string | undefined {
+  const trimmed = text?.trim();
+  if (!trimmed) {
+    return undefined;
+  }
+  const withoutControlChars = Array.from(trimmed)
+    .filter((char) => {
+      const code = char.charCodeAt(0);
+      return !(
+        code <= 0x08 ||
+        code === 0x0b ||
+        code === 0x0c ||
+        (code >= 0x0e && code <= 0x1f) ||
+        code === 0x7f
+      );
+    })
+    .join("");
+  const sanitized = withoutControlChars
+    .replace(/[\r\n\t]+/g, " ")
+    .replace(/\s+/g, " ")
+    .trim();
+  return sanitized.length > maxChars ? `${sanitized.slice(0, maxChars)}…` : sanitized;
+}
+
+function replaceRequestIdPreview(
+  text: string | undefined,
+  requestId: string | undefined,
+): string | undefined {
+  if (!text || !requestId) {
+    return text;
+  }
+  return text.split(requestId).join(redactIdentifier(requestId, { len: 12 }));
+}
+
+function redactObservationText(text: string | undefined): string | undefined {
+  if (!text) {
+    return text;
+  }
+  // Observation logs must stay redacted even when operators disable general-purpose
+  // log redaction, otherwise raw provider payloads leak back into always-on logs.
+  const configuredPatterns = resolveConfiguredRedactPatterns();
+  return redactSensitiveText(text, {
+    mode: "tools",
+    patterns: [
+      ...getDefaultRedactPatterns(),
+      ...configuredPatterns,
+      ...OBSERVATION_EXTRA_REDACT_PATTERNS,
+    ],
+  });
+}
+
+function extractRequestId(text: string | undefined): string | undefined {
+  if (!text) {
+    return undefined;
+  }
+  const match = text.match(REQUEST_ID_RE);
+  return match?.[1]?.trim() || undefined;
+}
+
+function buildObservationFingerprint(params: {
+  raw: string;
+  requestId?: string;
+  httpCode?: string;
+  type?: string;
+  message?: string;
+}): string | null {
+  const boundedMessage =
+    params.message && params.message.length > MAX_FINGERPRINT_MESSAGE_CHARS
+      ? params.message.slice(0, MAX_FINGERPRINT_MESSAGE_CHARS)
+      : params.message;
+  const structured =
+    params.httpCode || params.type || boundedMessage
+      ? stableStringify({
+          httpCode: params.httpCode,
+          type: params.type,
+          message: boundedMessage,
+        })
+      : null;
+  if (structured) {
+    return structured;
+  }
+  if (params.requestId) {
+    return params.raw.split(params.requestId).join("<request_id>");
+  }
+  return getApiErrorPayloadFingerprint(params.raw);
+}
+
+export function buildApiErrorObservationFields(rawError?: string): {
+  rawErrorPreview?: string;
+  rawErrorHash?: string;
+  rawErrorFingerprint?: string;
+  httpCode?: string;
+  providerErrorType?: string;
+  providerErrorMessagePreview?: string;
+  requestIdHash?: string;
+} {
+  const trimmed = boundObservationInput(rawError);
+  if (!trimmed) {
+    return {};
+  }
+  try {
+    const parsed = parseApiErrorInfo(trimmed);
+    const requestId = parsed?.requestId ?? extractRequestId(trimmed);
+    const requestIdHash = requestId ? redactIdentifier(requestId, { len: 12 }) : undefined;
+    const rawFingerprint = buildObservationFingerprint({
+      raw: trimmed,
+      requestId,
+      httpCode: parsed?.httpCode,
+      type: parsed?.type,
+      message: parsed?.message,
+    });
+    const redactedRawPreview = replaceRequestIdPreview(redactObservationText(trimmed), requestId);
+    const redactedProviderMessage = replaceRequestIdPreview(
+      redactObservationText(parsed?.message),
+      requestId,
+    );
+
+    return {
+      rawErrorPreview: truncateForObservation(redactedRawPreview, RAW_ERROR_PREVIEW_MAX_CHARS),
+      rawErrorHash: redactIdentifier(trimmed, { len: 12 }),
+      rawErrorFingerprint: rawFingerprint
+        ? redactIdentifier(rawFingerprint, { len: 12 })
+        : undefined,
+      httpCode: parsed?.httpCode,
+      providerErrorType: parsed?.type,
+      providerErrorMessagePreview: truncateForObservation(
+        redactedProviderMessage,
+        PROVIDER_ERROR_PREVIEW_MAX_CHARS,
+      ),
+      requestIdHash,
+    };
+  } catch {
+    return {};
+  }
+}
+
+export function buildTextObservationFields(text?: string): {
+  textPreview?: string;
+  textHash?: string;
+  textFingerprint?: string;
+  httpCode?: string;
+  providerErrorType?: string;
+  providerErrorMessagePreview?: string;
+  requestIdHash?: string;
+} {
+  const observed = buildApiErrorObservationFields(text);
+  return {
+    textPreview: observed.rawErrorPreview,
+    textHash: observed.rawErrorHash,
+    textFingerprint: observed.rawErrorFingerprint,
+    httpCode: observed.httpCode,
+    providerErrorType: observed.providerErrorType,
+    providerErrorMessagePreview: observed.providerErrorMessagePreview,
+    requestIdHash: observed.requestIdHash,
+  };
+}
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -61,6 +61,7 @@ import { resolveGlobalLane, resolveSessionLane } from "./lanes.js";
 import { log } from "./logger.js";
 import { resolveModel } from "./model.js";
 import { runEmbeddedAttempt } from "./run/attempt.js";
+import { createFailoverDecisionLogger } from "./run/failover-observation.js";
 import type { RunEmbeddedPiAgentParams } from "./run/params.js";
 import { buildEmbeddedRunPayloads } from "./run/payloads.js";
 import {
@@ -1226,11 +1227,26 @@ export async function runEmbeddedPiAgent(
              reason: promptProfileFailureReason,
            });
            const promptFailoverFailure = isFailoverErrorMessage(errorText);
+            // Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
+            const failedPromptProfileId = lastProfileId;
+            const logPromptFailoverDecision = createFailoverDecisionLogger({
+              stage: "prompt",
+              runId: params.runId,
+              rawError: errorText,
+              failoverReason: promptFailoverReason,
+              profileFailureReason: promptProfileFailureReason,
+              provider,
+              model: modelId,
+              profileId: failedPromptProfileId,
+              fallbackConfigured,
+              aborted,
+            });
            if (
              promptFailoverFailure &&
              promptFailoverReason !== "timeout" &&
              (await advanceAuthProfile())
            ) {
+              logPromptFailoverDecision("rotate_profile");
              await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
              continue;
            }
@@ -1249,15 +1265,20 @@ export async function runEmbeddedPiAgent(
            // are configured so outer model fallback can continue on overload,
            // rate-limit, auth, or billing failures.
            if (fallbackConfigured && promptFailoverFailure) {
+              const status = resolveFailoverStatus(promptFailoverReason ?? "unknown");
+              logPromptFailoverDecision("fallback_model", { status });
              await maybeBackoffBeforeOverloadFailover(promptFailoverReason);
              throw new FailoverError(errorText, {
                reason: promptFailoverReason ?? "unknown",
                provider,
                model: modelId,
                profileId: lastProfileId,
-                status: resolveFailoverStatus(promptFailoverReason ?? "unknown"),
+                status,
              });
            }
+            if (promptFailoverFailure || promptFailoverReason) {
+              logPromptFailoverDecision("surface_error");
+            }
            throw promptError;
          }

@@ -1282,6 +1303,21 @@ export async function runEmbeddedPiAgent(
            resolveAuthProfileFailureReason(assistantFailoverReason);
          const cloudCodeAssistFormatError = attempt.cloudCodeAssistFormatError;
          const imageDimensionError = parseImageDimensionError(lastAssistant?.errorMessage ?? "");
+          // Capture the failing profile before auth-profile rotation mutates `lastProfileId`.
+          const failedAssistantProfileId = lastProfileId;
+          const logAssistantFailoverDecision = createFailoverDecisionLogger({
+            stage: "assistant",
+            runId: params.runId,
+            rawError: lastAssistant?.errorMessage?.trim(),
+            failoverReason: assistantFailoverReason,
+            profileFailureReason: assistantProfileFailureReason,
+            provider: activeErrorContext.provider,
+            model: activeErrorContext.model,
+            profileId: failedAssistantProfileId,
+            fallbackConfigured,
+            timedOut,
+            aborted,
+          });

          if (
            authFailure &&
@@ -1339,6 +1375,7 @@ export async function runEmbeddedPiAgent(

            const rotated = await advanceAuthProfile();
            if (rotated) {
+              logAssistantFailoverDecision("rotate_profile");
              await maybeBackoffBeforeOverloadFailover(assistantFailoverReason);
              continue;
            }
@@ -1371,6 +1408,7 @@ export async function runEmbeddedPiAgent(
              const status =
                resolveFailoverStatus(assistantFailoverReason ?? "unknown") ??
                (isTimeoutErrorMessage(message) ? 408 : undefined);
+              logAssistantFailoverDecision("fallback_model", { status });
              throw new FailoverError(message, {
                reason: assistantFailoverReason ?? "unknown",
                provider: activeErrorContext.provider,
@@ -1379,6 +1417,7 @@ export async function runEmbeddedPiAgent(
                status,
              });
            }
+            logAssistantFailoverDecision("surface_error");
          }

          const usage = toNormalizedUsage(usageAccumulator);
--- a/src/agents/pi-embedded-runner/run/failover-observation.test.ts
+++ b/src/agents/pi-embedded-runner/run/failover-observation.test.ts
@@ -0,0 +1,48 @@
+import { describe, expect, it } from "vitest";
+import { normalizeFailoverDecisionObservationBase } from "./failover-observation.js";
+
+describe("normalizeFailoverDecisionObservationBase", () => {
+  it("fills timeout observation reasons for deadline timeouts without provider error text", () => {
+    expect(
+      normalizeFailoverDecisionObservationBase({
+        stage: "assistant",
+        runId: "run:timeout",
+        rawError: "",
+        failoverReason: null,
+        profileFailureReason: null,
+        provider: "openai",
+        model: "mock-1",
+        profileId: "openai:p1",
+        fallbackConfigured: false,
+        timedOut: true,
+        aborted: false,
+      }),
+    ).toMatchObject({
+      failoverReason: "timeout",
+      profileFailureReason: "timeout",
+      timedOut: true,
+    });
+  });
+
+  it("preserves explicit failover reasons", () => {
+    expect(
+      normalizeFailoverDecisionObservationBase({
+        stage: "assistant",
+        runId: "run:overloaded",
+        rawError: '{"error":{"type":"overloaded_error"}}',
+        failoverReason: "overloaded",
+        profileFailureReason: "overloaded",
+        provider: "openai",
+        model: "mock-1",
+        profileId: "openai:p1",
+        fallbackConfigured: true,
+        timedOut: true,
+        aborted: false,
+      }),
+    ).toMatchObject({
+      failoverReason: "overloaded",
+      profileFailureReason: "overloaded",
+      timedOut: true,
+    });
+  });
+});
--- a/src/agents/pi-embedded-runner/run/failover-observation.ts
+++ b/src/agents/pi-embedded-runner/run/failover-observation.ts
@@ -0,0 +1,76 @@
+import { redactIdentifier } from "../../../logging/redact-identifier.js";
+import type { AuthProfileFailureReason } from "../../auth-profiles.js";
+import {
+  buildApiErrorObservationFields,
+  sanitizeForConsole,
+} from "../../pi-embedded-error-observation.js";
+import type { FailoverReason } from "../../pi-embedded-helpers.js";
+import { log } from "../logger.js";
+
+export type FailoverDecisionLoggerInput = {
+  stage: "prompt" | "assistant";
+  decision: "rotate_profile" | "fallback_model" | "surface_error";
+  runId?: string;
+  rawError?: string;
+  failoverReason: FailoverReason | null;
+  profileFailureReason?: AuthProfileFailureReason | null;
+  provider: string;
+  model: string;
+  profileId?: string;
+  fallbackConfigured: boolean;
+  timedOut?: boolean;
+  aborted?: boolean;
+  status?: number;
+};
+
+export type FailoverDecisionLoggerBase = Omit<FailoverDecisionLoggerInput, "decision" | "status">;
+
+export function normalizeFailoverDecisionObservationBase(
+  base: FailoverDecisionLoggerBase,
+): FailoverDecisionLoggerBase {
+  return {
+    ...base,
+    failoverReason: base.failoverReason ?? (base.timedOut ? "timeout" : null),
+    profileFailureReason: base.profileFailureReason ?? (base.timedOut ? "timeout" : null),
+  };
+}
+
+export function createFailoverDecisionLogger(
+  base: FailoverDecisionLoggerBase,
+): (
+  decision: FailoverDecisionLoggerInput["decision"],
+  extra?: Pick<FailoverDecisionLoggerInput, "status">,
+) => void {
+  const normalizedBase = normalizeFailoverDecisionObservationBase(base);
+  const safeProfileId = normalizedBase.profileId
+    ? redactIdentifier(normalizedBase.profileId, { len: 12 })
+    : undefined;
+  const safeRunId = sanitizeForConsole(normalizedBase.runId) ?? "-";
+  const safeProvider = sanitizeForConsole(normalizedBase.provider) ?? "-";
+  const safeModel = sanitizeForConsole(normalizedBase.model) ?? "-";
+  const profileText = safeProfileId ?? "-";
+  const reasonText = normalizedBase.failoverReason ?? "none";
+  return (decision, extra) => {
+    const observedError = buildApiErrorObservationFields(normalizedBase.rawError);
+    log.warn("embedded run failover decision", {
+      event: "embedded_run_failover_decision",
+      tags: ["error_handling", "failover", normalizedBase.stage, decision],
+      runId: normalizedBase.runId,
+      stage: normalizedBase.stage,
+      decision,
+      failoverReason: normalizedBase.failoverReason,
+      profileFailureReason: normalizedBase.profileFailureReason,
+      provider: normalizedBase.provider,
+      model: normalizedBase.model,
+      profileId: safeProfileId,
+      fallbackConfigured: normalizedBase.fallbackConfigured,
+      timedOut: normalizedBase.timedOut,
+      aborted: normalizedBase.aborted,
+      status: extra?.status,
+      ...observedError,
+      consoleMessage:
+        `embedded run failover decision: runId=${safeRunId} stage=${normalizedBase.stage} decision=${decision} ` +
+        `reason=${reasonText} provider=${safeProvider}/${safeModel} profile=${profileText}`,
+    });
+  };
+}
--- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.test.ts
@@ -54,8 +54,13 @@ describe("handleAgentEnd", () => {

    const warn = vi.mocked(ctx.log.warn);
    expect(warn).toHaveBeenCalledTimes(1);
-    expect(warn.mock.calls[0]?.[0]).toContain("runId=run-1");
-    expect(warn.mock.calls[0]?.[0]).toContain("error=connection refused");
+    expect(warn.mock.calls[0]?.[0]).toBe("embedded run agent end");
+    expect(warn.mock.calls[0]?.[1]).toMatchObject({
+      event: "embedded_run_agent_end",
+      runId: "run-1",
+      error: "connection refused",
+      rawErrorPreview: "connection refused",
+    });
    expect(onAgentEvent).toHaveBeenCalledWith({
      stream: "lifecycle",
      data: {
@@ -65,6 +70,59 @@ describe("handleAgentEnd", () => {
    });
  });

+  it("attaches raw provider error metadata without changing the console message", () => {
+    const ctx = createContext({
+      role: "assistant",
+      stopReason: "error",
+      provider: "anthropic",
+      model: "claude-test",
+      errorMessage: '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}',
+      content: [{ type: "text", text: "" }],
+    });
+
+    handleAgentEnd(ctx);
+
+    const warn = vi.mocked(ctx.log.warn);
+    expect(warn).toHaveBeenCalledTimes(1);
+    expect(warn.mock.calls[0]?.[0]).toBe("embedded run agent end");
+    expect(warn.mock.calls[0]?.[1]).toMatchObject({
+      event: "embedded_run_agent_end",
+      runId: "run-1",
+      error: "The AI service is temporarily overloaded. Please try again in a moment.",
+      failoverReason: "overloaded",
+      providerErrorType: "overloaded_error",
+    });
+  });
+
+  it("redacts logged error text before emitting lifecycle events", () => {
+    const onAgentEvent = vi.fn();
+    const ctx = createContext(
+      {
+        role: "assistant",
+        stopReason: "error",
+        errorMessage: "x-api-key: sk-abcdefghijklmnopqrstuvwxyz123456",
+        content: [{ type: "text", text: "" }],
+      },
+      { onAgentEvent },
+    );
+
+    handleAgentEnd(ctx);
+
+    const warn = vi.mocked(ctx.log.warn);
+    expect(warn.mock.calls[0]?.[1]).toMatchObject({
+      event: "embedded_run_agent_end",
+      error: "x-api-key: ***",
+      rawErrorPreview: "x-api-key: ***",
+    });
+    expect(onAgentEvent).toHaveBeenCalledWith({
+      stream: "lifecycle",
+      data: {
+        phase: "error",
+        error: "x-api-key: ***",
+      },
+    });
+  });
+
  it("keeps non-error run-end logging on debug only", () => {
    const ctx = createContext(undefined);

--- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts
@@ -1,6 +1,11 @@
 import { emitAgentEvent } from "../infra/agent-events.js";
 import { createInlineCodeState } from "../markdown/code-spans.js";
-import { formatAssistantErrorText } from "./pi-embedded-helpers.js";
+import {
+  buildApiErrorObservationFields,
+  buildTextObservationFields,
+  sanitizeForConsole,
+} from "./pi-embedded-error-observation.js";
+import { classifyFailoverReason, formatAssistantErrorText } from "./pi-embedded-helpers.js";
 import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";
 import { isAssistantMessage } from "./pi-embedded-utils.js";

@@ -36,16 +41,31 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
      provider: lastAssistant.provider,
      model: lastAssistant.model,
    });
+    const rawError = lastAssistant.errorMessage?.trim();
+    const failoverReason = classifyFailoverReason(rawError ?? "");
    const errorText = (friendlyError || lastAssistant.errorMessage || "LLM request failed.").trim();
-    ctx.log.warn(
-      `embedded run agent end: runId=${ctx.params.runId} isError=true error=${errorText}`,
-    );
+    const observedError = buildApiErrorObservationFields(rawError);
+    const safeErrorText =
+      buildTextObservationFields(errorText).textPreview ?? "LLM request failed.";
+    const safeRunId = sanitizeForConsole(ctx.params.runId) ?? "-";
+    ctx.log.warn("embedded run agent end", {
+      event: "embedded_run_agent_end",
+      tags: ["error_handling", "lifecycle", "agent_end", "assistant_error"],
+      runId: ctx.params.runId,
+      isError: true,
+      error: safeErrorText,
+      failoverReason,
+      provider: lastAssistant.provider,
+      model: lastAssistant.model,
+      ...observedError,
+      consoleMessage: `embedded run agent end: runId=${safeRunId} isError=true error=${safeErrorText}`,
+    });
    emitAgentEvent({
      runId: ctx.params.runId,
      stream: "lifecycle",
      data: {
        phase: "error",
-        error: errorText,
+        error: safeErrorText,
        endedAt: Date.now(),
      },
    });
@@ -53,7 +73,7 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {
      stream: "lifecycle",
      data: {
        phase: "error",
-        error: errorText,
+        error: safeErrorText,
      },
    });
  } else {
--- a/src/agents/pi-embedded-subscribe.handlers.types.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.types.ts
@@ -12,8 +12,8 @@ import type {
 import type { NormalizedUsage } from "./usage.js";

 export type EmbeddedSubscribeLogger = {
-  debug: (message: string) => void;
-  warn: (message: string) => void;
+  debug: (message: string, meta?: Record<string, unknown>) => void;
+  warn: (message: string, meta?: Record<string, unknown>) => void;
 };

 export type ToolErrorSummary = {