fix: defer gateway restart until all replies are sent (#12970)

* fix: defer gateway restart until all replies are sent

Fixes a race condition where gateway config changes (e.g., enabling
plugins via iMessage) trigger an immediate SIGUSR1 restart, killing the
iMessage RPC connection before replies are delivered.

Both restart paths (config watcher and RPC-triggered) now defer until
all queued operations, pending replies, and embedded agent runs complete
(polling every 500ms, 30s timeout). A shared emitGatewayRestart() guard
prevents double SIGUSR1 when both paths fire simultaneously.

Key changes:
- Dispatcher registry tracks active reply dispatchers globally
- markComplete() called in finally block for guaranteed cleanup
- Pre-restart deferral hook registered at gateway startup
- Centralized extractDeliveryInfo() for session key parsing
- Post-restart sentinel messages delivered directly (not via agent)
- config-patch distinguished from config-apply in sentinel kind

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: single-source gateway restart authorization

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
Bridgerz
2026-02-13 15:29:29 -08:00
committed by GitHub
parent dc507f3dec
commit ab4a08a82a
21 changed files with 976 additions and 76 deletions

View File

@@ -9,6 +9,7 @@ import {
isGatewaySigusr1RestartExternallyAllowed,
scheduleGatewaySigusr1Restart,
setGatewaySigusr1RestartPolicy,
setPreRestartDeferralCheck,
} from "./restart.js";
import { createTelegramRetryRunner } from "./retry-policy.js";
import { getShellPathFromLoginShell, resetShellPathCacheForTests } from "./shell-env.js";
@@ -79,11 +80,15 @@ describe("infra runtime", () => {
__testing.resetSigusr1State();
});
it("consumes a scheduled authorization once", async () => {
it("authorizes exactly once when scheduled restart emits", async () => {
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
scheduleGatewaySigusr1Restart({ delayMs: 0 });
// No pre-authorization before the scheduled emission fires.
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
await vi.advanceTimersByTimeAsync(0);
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(true);
expect(consumeGatewaySigusr1RestartAuthorization()).toBe(false);
@@ -97,6 +102,110 @@ describe("infra runtime", () => {
});
});
describe("pre-restart deferral check", () => {
  /**
   * Attaches a no-op SIGUSR1 listener to `process` and hands back a function
   * that detaches that same listener again. Every test in this suite keeps
   * such a listener registered for the duration of its assertions.
   */
  const attachSigusr1Listener = () => {
    const noop = () => {};
    process.on("SIGUSR1", noop);
    return () => {
      process.removeListener("SIGUSR1", noop);
    };
  };

  beforeEach(() => {
    __testing.resetSigusr1State();
    vi.useFakeTimers();
    // Stub process.kill so nothing in these tests can deliver a real signal.
    vi.spyOn(process, "kill").mockImplementation(() => true);
  });

  afterEach(async () => {
    // Flush timers that are still queued before switching back to real timers.
    await vi.runOnlyPendingTimersAsync();
    vi.useRealTimers();
    vi.restoreAllMocks();
    __testing.resetSigusr1State();
  });

  it("emits SIGUSR1 immediately when no deferral check is registered", async () => {
    const emitSpy = vi.spyOn(process, "emit");
    const detach = attachSigusr1Listener();
    try {
      scheduleGatewaySigusr1Restart({ delayMs: 0 });
      await vi.advanceTimersByTimeAsync(0);
      expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
    } finally {
      detach();
    }
  });

  it("emits SIGUSR1 immediately when deferral check returns 0", async () => {
    const emitSpy = vi.spyOn(process, "emit");
    const detach = attachSigusr1Listener();
    try {
      setPreRestartDeferralCheck(() => 0);
      scheduleGatewaySigusr1Restart({ delayMs: 0 });
      await vi.advanceTimersByTimeAsync(0);
      expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
    } finally {
      detach();
    }
  });

  it("defers SIGUSR1 until deferral check returns 0", async () => {
    const emitSpy = vi.spyOn(process, "emit");
    const detach = attachSigusr1Listener();
    try {
      let outstanding = 2;
      setPreRestartDeferralCheck(() => outstanding);
      scheduleGatewaySigusr1Restart({ delayMs: 0 });

      // The scheduled delay has elapsed, but the check reports 2 pending
      // items, so no SIGUSR1 is expected yet.
      await vi.advanceTimersByTimeAsync(0);
      expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");

      // 500ms later the check still reports pending work.
      await vi.advanceTimersByTimeAsync(500);
      expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");

      // Once the check reports 0, the next 500ms step should emit.
      outstanding = 0;
      await vi.advanceTimersByTimeAsync(500);
      expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
    } finally {
      detach();
    }
  });

  it("emits SIGUSR1 after deferral timeout even if still pending", async () => {
    const emitSpy = vi.spyOn(process, "emit");
    const detach = attachSigusr1Listener();
    try {
      // The check never reports 0.
      setPreRestartDeferralCheck(() => 5);
      scheduleGatewaySigusr1Restart({ delayMs: 0 });

      // Let the initial scheduled delay fire.
      await vi.advanceTimersByTimeAsync(0);
      expect(emitSpy).not.toHaveBeenCalledWith("SIGUSR1");

      // After a further 30s of fake time the restart is expected regardless.
      await vi.advanceTimersByTimeAsync(30_000);
      expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
    } finally {
      detach();
    }
  });

  it("emits SIGUSR1 if deferral check throws", async () => {
    const emitSpy = vi.spyOn(process, "emit");
    const detach = attachSigusr1Listener();
    try {
      setPreRestartDeferralCheck(() => {
        throw new Error("boom");
      });
      scheduleGatewaySigusr1Restart({ delayMs: 0 });
      await vi.advanceTimersByTimeAsync(0);
      expect(emitSpy).toHaveBeenCalledWith("SIGUSR1");
    } finally {
      detach();
    }
  });
});
describe("getShellPathFromLoginShell", () => {
afterEach(() => resetShellPathCacheForTests());