fix(agents): tighten overload probe and backoff state

This commit is contained in:
Altay
2026-03-06 15:47:23 +03:00
parent c7148f1a66
commit 890dbf523f
4 changed files with 69 additions and 1 deletions

View File

@@ -707,6 +707,15 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
expect(computeBackoffMock).toHaveBeenCalledTimes(1);
expect(computeBackoffMock).toHaveBeenCalledWith(
expect.objectContaining({
initialMs: 250,
maxMs: 1500,
factor: 2,
jitter: 0.2,
}),
1,
);
expect(sleepWithAbortMock).toHaveBeenCalledTimes(1);
expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined);
});
@@ -720,6 +729,15 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
expect(typeof usageStats["openai:p2"]?.lastUsed).toBe("number");
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
expect(computeBackoffMock).toHaveBeenCalledTimes(1);
expect(computeBackoffMock).toHaveBeenCalledWith(
expect.objectContaining({
initialMs: 250,
maxMs: 1500,
factor: 2,
jitter: 0.2,
}),
1,
);
expect(sleepWithAbortMock).toHaveBeenCalledTimes(1);
expect(sleepWithAbortMock).toHaveBeenCalledWith(321, undefined);
});
@@ -746,6 +764,54 @@ describe("runEmbeddedPiAgent auth profile rotation", () => {
expect(usageStats["openai:p1"]?.cooldownUntil).toBeUndefined();
});
// Runs two consecutive turns that each fail once with an overloaded error and then
// succeed, and checks that the backoff helper is invoked once per turn with the same
// policy and the same second argument (1) — i.e. the second turn does not continue
// from where the first one left off.
// NOTE(review): the second argument to computeBackoff is presumably the attempt
// number — confirm against the helper's signature.
it("resets overload failover backoff after a successful turn", async () => {
  await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
    await writeAuthStore(agentDir);

    // Provider error body used to trigger the failed first attempt of each turn.
    const overloadedPayload =
      '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"}}';

    // Each turn uses its own session key and run id.
    const turns = [
      {
        sessionKey: "agent:test:overloaded-backoff-reset-1",
        runId: "run:overloaded-backoff-reset-1",
      },
      {
        sessionKey: "agent:test:overloaded-backoff-reset-2",
        runId: "run:overloaded-backoff-reset-2",
      },
    ];

    // Turns must run strictly one after another, so await inside the loop.
    for (const { sessionKey, runId } of turns) {
      mockFailedThenSuccessfulAttempt(overloadedPayload);
      await runAutoPinnedOpenAiTurn({
        agentDir,
        workspaceDir,
        sessionKey,
        runId,
      });
    }

    // Backoff policy fields every computeBackoff call is expected to carry.
    const expectedPolicy = expect.objectContaining({
      initialMs: 250,
      maxMs: 1500,
      factor: 2,
      jitter: 0.2,
    });

    expect(computeBackoffMock).toHaveBeenCalledTimes(turns.length);
    // Both the first and the second call must receive 1 as the second argument.
    for (const nthCall of [1, 2]) {
      expect(computeBackoffMock).toHaveBeenNthCalledWith(nthCall, expectedPolicy, 1);
    }
  });
});
it("does not rotate for compaction timeouts", async () => {
await withAgentWorkspace(async ({ agentDir, workspaceDir }) => {
await writeAuthStore(agentDir);

View File

@@ -1400,6 +1400,7 @@ export async function runEmbeddedPiAgent(
`embedded run done: runId=${params.runId} sessionId=${params.sessionId} durationMs=${Date.now() - started} aborted=${aborted}`,
);
if (lastProfileId) {
overloadFailoverAttempts = 0;
await markAuthProfileGood({
store: authStore,
provider,

View File

@@ -9,6 +9,7 @@ describe("mapFailoverReasonToProbeStatus", () => {
it("keeps existing failover reason mappings", () => {
expect(mapFailoverReasonToProbeStatus("auth")).toBe("auth");
expect(mapFailoverReasonToProbeStatus("rate_limit")).toBe("rate_limit");
expect(mapFailoverReasonToProbeStatus("overloaded")).toBe("rate_limit");
expect(mapFailoverReasonToProbeStatus("billing")).toBe("billing");
expect(mapFailoverReasonToProbeStatus("timeout")).toBe("timeout");
expect(mapFailoverReasonToProbeStatus("format")).toBe("format");

View File

@@ -106,7 +106,7 @@ export function mapFailoverReasonToProbeStatus(reason?: string | null): AuthProb
// surface in the auth bucket instead of showing as unknown.
return "auth";
}
if (reason === "rate_limit") {
if (reason === "rate_limit" || reason === "overloaded") {
return "rate_limit";
}
if (reason === "billing") {