fix(agents): handle overloaded failover separately (#38301)

* fix(agents): skip auth-profile failure on overload

* fix(agents): note overload auth-profile fallback fix

* fix(agents): classify overloaded failures separately

* fix(agents): back off before overload failover

* fix(agents): tighten overload probe and backoff state

* fix(agents): persist overloaded cooldown across runs

* fix(agents): tighten overloaded status handling

* test(agents): add overload regression coverage

* fix(agents): restore runner imports after rebase

* test(agents): add overload fallback integration coverage

* fix(agents): harden overloaded failover abort handling

* test(agents): tighten overload classifier coverage

* test(agents): cover all-overloaded fallback exhaustion

* fix(cron): retry overloaded fallback summaries

* fix(cron): treat HTTP 529 as overloaded retry
This commit is contained in:
Altay
2026-03-07 01:42:11 +03:00
committed by GitHub
parent 110ca23bab
commit 6e962d8b9e
36 changed files with 1036 additions and 84 deletions

View File

@@ -174,7 +174,7 @@ function runAgentAttempt(params: {
primaryProvider: string;
sessionStore?: Record<string, SessionEntry>;
storePath?: string;
- allowRateLimitCooldownProbe?: boolean;
+ allowTransientCooldownProbe?: boolean;
}) {
const effectivePrompt = resolveFallbackRetryPrompt({
body: params.body,
@@ -325,7 +325,7 @@ function runAgentAttempt(params: {
inputProvenance: params.opts.inputProvenance,
streamParams: params.opts.streamParams,
agentDir: params.agentDir,
- allowRateLimitCooldownProbe: params.allowRateLimitCooldownProbe,
+ allowTransientCooldownProbe: params.allowTransientCooldownProbe,
onAgentEvent: params.onAgentEvent,
bootstrapPromptWarningSignaturesSeen,
bootstrapPromptWarningSignature,
@@ -868,7 +868,7 @@ async function agentCommandInternal(
primaryProvider: provider,
sessionStore,
storePath,
- allowRateLimitCooldownProbe: runOptions?.allowRateLimitCooldownProbe,
+ allowTransientCooldownProbe: runOptions?.allowTransientCooldownProbe,
onAgentEvent: (evt) => {
// Track lifecycle end for fallback emission below.
if (

View File

@@ -9,6 +9,7 @@ describe("mapFailoverReasonToProbeStatus", () => {
it("keeps existing failover reason mappings", () => {
expect(mapFailoverReasonToProbeStatus("auth")).toBe("auth");
expect(mapFailoverReasonToProbeStatus("rate_limit")).toBe("rate_limit");
+ expect(mapFailoverReasonToProbeStatus("overloaded")).toBe("rate_limit");
expect(mapFailoverReasonToProbeStatus("billing")).toBe("billing");
expect(mapFailoverReasonToProbeStatus("timeout")).toBe("timeout");
expect(mapFailoverReasonToProbeStatus("format")).toBe("format");

View File

@@ -106,7 +106,7 @@ export function mapFailoverReasonToProbeStatus(reason?: string | null): AuthProb
// surface in the auth bucket instead of showing as unknown.
return "auth";
}
- if (reason === "rate_limit") {
+ if (reason === "rate_limit" || reason === "overloaded") {
return "rate_limit";
}
if (reason === "billing") {