fix(agents): avoid double websocket retry accounting on reconnect failures (#39133)

Co-authored-by: scoootscooob <zhentongfan@gmail.com>
This commit is contained in:
Peter Steinberger
2026-03-07 19:55:13 +00:00
parent ac86deccee
commit 3c1176110a
3 changed files with 53 additions and 5 deletions

View File

@@ -506,6 +506,53 @@ describe("OpenAIWebSocketManager", () => {
expect(maxRetryError).toBeDefined();
});
it("does not double-count retries when error and close both fire on a reconnect attempt", async () => {
// In the real `ws` library, a failed connection fires "error" followed
// by "close". Previously, both the onClose handler AND the promise
// .catch() in _scheduleReconnect called _scheduleReconnect(), which
// double-incremented retryCount and exhausted the retry budget
// prematurely (e.g. 3 retries became ~1-2 actual attempts).
const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 5, 5] });
const errors = attachErrorCollector(manager);
const p = manager.connect("sk-test");
lastSocket().simulateOpen();
await p;
// Drop the established connection — triggers first reconnect schedule
lastSocket().simulateClose(1006, "Network error");
// Advance past first retry delay — a new socket is created
await vi.advanceTimersByTimeAsync(10);
const sock2 = lastSocket();
// Simulate a realistic failure: error fires first, then close follows.
sock2.simulateError(new Error("ECONNREFUSED"));
sock2.simulateClose(1006, "Connection failed");
// Advance past second retry delay — another socket should be created
// because we've only used 2 retries (not 3 from double-counting).
await vi.advanceTimersByTimeAsync(10);
const sock3 = lastSocket();
expect(sock3).not.toBe(sock2);
// Third attempt also fails with error+close
sock3.simulateError(new Error("ECONNREFUSED"));
sock3.simulateClose(1006, "Connection failed");
// Advance past third retry delay — one more attempt (retry 3 of 3)
await vi.advanceTimersByTimeAsync(10);
const sock4 = lastSocket();
expect(sock4).not.toBe(sock3);
// Fourth socket also fails — now retries should be exhausted (3/3)
sock4.simulateError(new Error("ECONNREFUSED"));
sock4.simulateClose(1006, "Connection failed");
await vi.advanceTimersByTimeAsync(10);
const maxRetryError = errors.find((e) => e.message.includes("max reconnect retries"));
expect(maxRetryError).toBeDefined();
});
it("resets retry count after a successful reconnect", async () => {
const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 10, 20] });
const p = manager.connect("sk-test");

View File

@@ -446,11 +446,11 @@ export class OpenAIWebSocketManager extends EventEmitter<InternalEvents> {
if (this.closed) {
return;
}
this._openConnection().catch((err: unknown) => {
// onError handler already emitted error event; schedule next retry.
void err;
this._scheduleReconnect();
});
// The onClose handler already calls _scheduleReconnect() for the next
// attempt, so we intentionally swallow the rejection here to avoid
// double-scheduling (which would double-increment retryCount per
// failed reconnect and exhaust the retry budget prematurely).
this._openConnection().catch(() => {});
}, delayMs);
}