fix(agents): avoid duplicate same-provider cooldown probes in fallback runs (#41711)

Merged via squash.

Prepared head SHA: 8be8967bcb
Co-authored-by: cgdusek <38732970+cgdusek@users.noreply.github.com>
Co-authored-by: altaywtf <9790196+altaywtf@users.noreply.github.com>
Reviewed-by: @altaywtf
This commit is contained in:
Charles Dusek
2026-03-10 07:26:47 -05:00
committed by GitHub
parent bda63c3c7f
commit 048e25c2b2
3 changed files with 129 additions and 0 deletions

View File

@@ -521,6 +521,7 @@ export async function runWithModelFallback<T>(params: {
: null;
const attempts: FallbackAttempt[] = [];
let lastError: unknown;
const cooldownProbeUsedProviders = new Set<string>();
const hasFallbackCandidates = candidates.length > 1;
@@ -531,6 +532,7 @@ export async function runWithModelFallback<T>(params: {
params.provider === candidate.provider && params.model === candidate.model;
let runOptions: ModelFallbackRunOptions | undefined;
let attemptedDuringCooldown = false;
let transientProbeProviderForAttempt: string | null = null;
if (authStore) {
const profileIds = resolveAuthProfileOrder({
cfg: params.cfg,
@@ -588,7 +590,41 @@ export async function runWithModelFallback<T>(params: {
decision.reason === "overloaded" ||
decision.reason === "billing"
) {
// Probe at most once per provider per fallback run when all of its
// profiles are in cooldown. Re-probing every same-provider candidate can
// stall cross-provider fallback on providers with long internal retries.
const isTransientCooldownReason =
decision.reason === "rate_limit" || decision.reason === "overloaded";
if (isTransientCooldownReason && cooldownProbeUsedProviders.has(candidate.provider)) {
const error = `Provider ${candidate.provider} is in cooldown (probe already attempted this run)`;
attempts.push({
provider: candidate.provider,
model: candidate.model,
error,
reason: decision.reason,
});
logModelFallbackDecision({
decision: "skip_candidate",
runId: params.runId,
requestedProvider: params.provider,
requestedModel: params.model,
candidate,
attempt: i + 1,
total: candidates.length,
reason: decision.reason,
error,
nextCandidate: candidates[i + 1],
isPrimary,
requestedModelMatched: requestedModel,
fallbackConfigured: hasFallbackCandidates,
profileCount: profileIds.length,
});
continue;
}
runOptions = { allowTransientCooldownProbe: true };
if (isTransientCooldownReason) {
transientProbeProviderForAttempt = candidate.provider;
}
}
attemptedDuringCooldown = true;
logModelFallbackDecision({
@@ -643,6 +679,18 @@ export async function runWithModelFallback<T>(params: {
}
const err = attemptRun.error;
{
if (transientProbeProviderForAttempt) {
const probeFailureReason = describeFailoverError(err).reason;
const shouldPreserveTransientProbeSlot =
probeFailureReason === "model_not_found" ||
probeFailureReason === "format" ||
probeFailureReason === "auth" ||
probeFailureReason === "auth_permanent" ||
probeFailureReason === "session_expired";
if (!shouldPreserveTransientProbeSlot) {
cooldownProbeUsedProviders.add(transientProbeProviderForAttempt);
}
}
// Context overflow errors should be handled by the inner runner's
// compaction/retry logic, not by model fallback. If one escapes as a
// throw, rethrow it immediately rather than trying a different model