fix(agents): handle overloaded failover separately (#38301)

* fix(agents): skip auth-profile failure on overload

* fix(agents): note overload auth-profile fallback fix

* fix(agents): classify overloaded failures separately

* fix(agents): back off before overload failover

* fix(agents): tighten overload probe and backoff state

* fix(agents): persist overloaded cooldown across runs

* fix(agents): tighten overloaded status handling

* test(agents): add overload regression coverage

* fix(agents): restore runner imports after rebase

* test(agents): add overload fallback integration coverage

* fix(agents): harden overloaded failover abort handling

* test(agents): tighten overload classifier coverage

* test(agents): cover all-overloaded fallback exhaustion

* fix(cron): retry overloaded fallback summaries

* fix(cron): treat HTTP 529 as overloaded retry
This commit is contained in:
Altay
2026-03-07 01:42:11 +03:00
committed by GitHub
parent 110ca23bab
commit 6e962d8b9e
36 changed files with 1036 additions and 84 deletions

View File

@@ -39,6 +39,7 @@ export type AuthProfileFailureReason =
| "auth"
| "auth_permanent"
| "format"
| "overloaded"
| "rate_limit"
| "billing"
| "timeout"

View File

@@ -177,6 +177,24 @@ describe("resolveProfilesUnavailableReason", () => {
).toBe("auth");
});
it("returns overloaded for active overloaded cooldown windows", () => {
const now = Date.now();
const store = makeStore({
"anthropic:default": {
cooldownUntil: now + 60_000,
failureCounts: { overloaded: 2, rate_limit: 1 },
},
});
expect(
resolveProfilesUnavailableReason({
store,
profileIds: ["anthropic:default"],
now,
}),
).toBe("overloaded");
});
it("falls back to rate_limit when active cooldown has no reason history", () => {
const now = Date.now();
const store = makeStore({

View File

@@ -9,6 +9,7 @@ const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
"billing",
"format",
"model_not_found",
"overloaded",
"timeout",
"rate_limit",
"unknown",
@@ -35,7 +36,7 @@ export function resolveProfileUnusableUntil(
}
/**
* Check if a profile is currently in cooldown (due to rate limiting or errors).
* Check if a profile is currently in cooldown (due to rate limits, overload, or other transient failures).
*/
export function isProfileInCooldown(
store: AuthProfileStore,
@@ -508,7 +509,7 @@ export async function markAuthProfileFailure(params: {
}
/**
* Mark a profile as failed/rate-limited. Applies exponential backoff cooldown.
* Mark a profile as transiently failed. Applies exponential backoff cooldown.
* Cooldown times: 1min, 5min, 25min, max 1 hour.
* Uses store lock to avoid overwriting concurrent usage updates.
*/