Agents: add fallback error observations (#41337)

Merged via squash. Prepared head SHA: 852469c82f Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com> Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com> Reviewed-by: @altaywtf
2026-06-07 22:09:57 +00:00 · 2026-03-10 01:12:10 +03:00
parent 3c3474360b
commit 531e8362b1
17 changed files with 502 additions and 36 deletions
--- a/src/agents/model-fallback.ts
+++ b/src/agents/model-fallback.ts
@@ -19,6 +19,8 @@ import {
  isFailoverError,
  isTimeoutError,
 } from "./failover-error.js";
+import { logModelFallbackDecision } from "./model-fallback-observation.js";
+import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
 import {
  buildConfiguredAllowlistKeys,
  buildModelAliasIndex,
@@ -32,11 +34,6 @@ import { isLikelyContextOverflowError } from "./pi-embedded-helpers.js";

 const log = createSubsystemLogger("model-fallback");

-type ModelCandidate = {
-  provider: string;
-  model: string;
-};
-
 export type ModelFallbackRunOptions = {
  allowTransientCooldownProbe?: boolean;
 };
@@ -47,15 +44,6 @@ type ModelFallbackRunFn<T> = (
  options?: ModelFallbackRunOptions,
 ) => Promise<T>;

-type FallbackAttempt = {
-  provider: string;
-  model: string;
-  error: string;
-  reason?: FailoverReason;
-  status?: number;
-  code?: string;
-};
-
 /**
 * Fallback abort check. Only treats explicit AbortError names as user aborts.
 * Message-based checks (e.g., "aborted") can mask timeouts and skip fallback.
@@ -515,6 +503,7 @@ export async function runWithModelFallback<T>(params: {
  cfg: OpenClawConfig | undefined;
  provider: string;
  model: string;
+  runId?: string;
  agentDir?: string;
  /** Optional explicit fallbacks list; when provided (even empty), replaces agents.defaults.model.fallbacks. */
  fallbacksOverride?: string[];
@@ -537,7 +526,11 @@ export async function runWithModelFallback<T>(params: {

  for (let i = 0; i < candidates.length; i += 1) {
    const candidate = candidates[i];
+    const isPrimary = i === 0;
+    const requestedModel =
+      params.provider === candidate.provider && params.model === candidate.model;
    let runOptions: ModelFallbackRunOptions | undefined;
+    let attemptedDuringCooldown = false;
    if (authStore) {
      const profileIds = resolveAuthProfileOrder({
        cfg: params.cfg,
@@ -548,9 +541,6 @@ export async function runWithModelFallback<T>(params: {

      if (profileIds.length > 0 && !isAnyProfileAvailable) {
        // All profiles for this provider are in cooldown.
-        const isPrimary = i === 0;
-        const requestedModel =
-          params.provider === candidate.provider && params.model === candidate.model;
        const now = Date.now();
        const probeThrottleKey = resolveProbeThrottleKey(candidate.provider, params.agentDir);
        const decision = resolveCooldownDecision({
@@ -571,6 +561,22 @@ export async function runWithModelFallback<T>(params: {
            error: decision.error,
            reason: decision.reason,
          });
+          logModelFallbackDecision({
+            decision: "skip_candidate",
+            runId: params.runId,
+            requestedProvider: params.provider,
+            requestedModel: params.model,
+            candidate,
+            attempt: i + 1,
+            total: candidates.length,
+            reason: decision.reason,
+            error: decision.error,
+            nextCandidate: candidates[i + 1],
+            isPrimary,
+            requestedModelMatched: requestedModel,
+            fallbackConfigured: hasFallbackCandidates,
+            profileCount: profileIds.length,
+          });
          continue;
        }

@@ -584,6 +590,23 @@ export async function runWithModelFallback<T>(params: {
        ) {
          runOptions = { allowTransientCooldownProbe: true };
        }
+        attemptedDuringCooldown = true;
+        logModelFallbackDecision({
+          decision: "probe_cooldown_candidate",
+          runId: params.runId,
+          requestedProvider: params.provider,
+          requestedModel: params.model,
+          candidate,
+          attempt: i + 1,
+          total: candidates.length,
+          reason: decision.reason,
+          nextCandidate: candidates[i + 1],
+          isPrimary,
+          requestedModelMatched: requestedModel,
+          fallbackConfigured: hasFallbackCandidates,
+          allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe,
+          profileCount: profileIds.length,
+        });
      }
    }

@@ -594,6 +617,21 @@ export async function runWithModelFallback<T>(params: {
      options: runOptions,
    });
    if ("success" in attemptRun) {
+      if (i > 0 || attempts.length > 0 || attemptedDuringCooldown) {
+        logModelFallbackDecision({
+          decision: "candidate_succeeded",
+          runId: params.runId,
+          requestedProvider: params.provider,
+          requestedModel: params.model,
+          candidate,
+          attempt: i + 1,
+          total: candidates.length,
+          previousAttempts: attempts,
+          isPrimary,
+          requestedModelMatched: requestedModel,
+          fallbackConfigured: hasFallbackCandidates,
+        });
+      }
      const notFoundAttempt =
        i > 0 ? attempts.find((a) => a.reason === "model_not_found") : undefined;
      if (notFoundAttempt) {
@@ -637,6 +675,23 @@ export async function runWithModelFallback<T>(params: {
        status: described.status,
        code: described.code,
      });
+      logModelFallbackDecision({
+        decision: "candidate_failed",
+        runId: params.runId,
+        requestedProvider: params.provider,
+        requestedModel: params.model,
+        candidate,
+        attempt: i + 1,
+        total: candidates.length,
+        reason: described.reason,
+        status: described.status,
+        code: described.code,
+        error: described.message,
+        nextCandidate: candidates[i + 1],
+        isPrimary,
+        requestedModelMatched: requestedModel,
+        fallbackConfigured: hasFallbackCandidates,
+      });
      await params.onError?.({
        provider: candidate.provider,
        model: candidate.model,