mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 18:54:31 +00:00
fix(gateway): stop stale-socket restarts before first event (#38643)
* fix(gateway): guard stale-socket restarts by event liveness
* fix(gateway): centralize connect-time liveness tracking
* fix(web): apply connected status patch atomically
* fix(gateway): require active socket for stale checks
* fix(gateway): ignore inherited stale event timestamps
This commit is contained in:
@@ -35,6 +35,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
- Gateway/Telegram stale-socket restart guard: only apply stale-socket restarts to channels that publish event-liveness timestamps, preventing Telegram providers from being misclassified as stale solely due to long uptime and avoiding restart/pairing storms after upgrade. (openclaw#38464)
|
||||||
- Onboarding/headless Linux daemon probe hardening: treat `systemctl --user is-enabled` probe failures as non-fatal during daemon install flow so onboarding no longer crashes on SSH/headless VPS environments before showing install guidance. (#37297) Thanks @acarbajal-web.
|
- Onboarding/headless Linux daemon probe hardening: treat `systemctl --user is-enabled` probe failures as non-fatal during daemon install flow so onboarding no longer crashes on SSH/headless VPS environments before showing install guidance. (#37297) Thanks @acarbajal-web.
|
||||||
- Memory/QMD mcporter Windows spawn hardening: when `mcporter.cmd` launch fails with `spawn EINVAL`, retry via bare `mcporter` shell resolution so QMD recall can continue instead of falling back to builtin memory search. (#27402) Thanks @i0ivi0i.
|
- Memory/QMD mcporter Windows spawn hardening: when `mcporter.cmd` launch fails with `spawn EINVAL`, retry via bare `mcporter` shell resolution so QMD recall can continue instead of falling back to builtin memory search. (#27402) Thanks @i0ivi0i.
|
||||||
- Tools/web_search Brave language-code validation: align `search_lang` handling with Brave-supported codes (including `zh-hans`, `zh-hant`, `en-gb`, and `pt-br`), map common alias inputs (`zh`, `ja`) to valid Brave values, and reject unsupported codes before upstream requests to prevent 422 failures. (#37260) Thanks @heyanming.
|
- Tools/web_search Brave language-code validation: align `search_lang` handling with Brave-supported codes (including `zh-hans`, `zh-hant`, `en-gb`, and `pt-br`), map common alias inputs (`zh`, `ja`) to valid Brave values, and reject unsupported codes before upstream requests to prevent 422 failures. (#37260) Thanks @heyanming.
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import type { Client } from "@buape/carbon";
|
import type { Client } from "@buape/carbon";
|
||||||
import type { GatewayPlugin } from "@buape/carbon/gateway";
|
import type { GatewayPlugin } from "@buape/carbon/gateway";
|
||||||
import { createArmableStallWatchdog } from "../../channels/transport/stall-watchdog.js";
|
import { createArmableStallWatchdog } from "../../channels/transport/stall-watchdog.js";
|
||||||
|
import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js";
|
||||||
import { danger } from "../../globals.js";
|
import { danger } from "../../globals.js";
|
||||||
import type { RuntimeEnv } from "../../runtime.js";
|
import type { RuntimeEnv } from "../../runtime.js";
|
||||||
import { attachDiscordGatewayLogging } from "../gateway-logging.js";
|
import { attachDiscordGatewayLogging } from "../gateway-logging.js";
|
||||||
@@ -180,8 +181,7 @@ export async function runDiscordGatewayLifecycle(params: {
|
|||||||
let sawConnected = gateway?.isConnected === true;
|
let sawConnected = gateway?.isConnected === true;
|
||||||
if (sawConnected) {
|
if (sawConnected) {
|
||||||
pushStatus({
|
pushStatus({
|
||||||
connected: true,
|
...createConnectedChannelStatusPatch(at),
|
||||||
lastConnectedAt: at,
|
|
||||||
lastDisconnect: null,
|
lastDisconnect: null,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -194,9 +194,7 @@ export async function runDiscordGatewayLifecycle(params: {
|
|||||||
const connectedAt = Date.now();
|
const connectedAt = Date.now();
|
||||||
reconnectStallWatchdog.disarm();
|
reconnectStallWatchdog.disarm();
|
||||||
pushStatus({
|
pushStatus({
|
||||||
connected: true,
|
...createConnectedChannelStatusPatch(connectedAt),
|
||||||
lastEventAt: connectedAt,
|
|
||||||
lastConnectedAt: connectedAt,
|
|
||||||
lastDisconnect: null,
|
lastDisconnect: null,
|
||||||
});
|
});
|
||||||
if (helloConnectedPollId) {
|
if (helloConnectedPollId) {
|
||||||
@@ -253,9 +251,7 @@ export async function runDiscordGatewayLifecycle(params: {
|
|||||||
if (gateway?.isConnected && !lifecycleStopping) {
|
if (gateway?.isConnected && !lifecycleStopping) {
|
||||||
const at = Date.now();
|
const at = Date.now();
|
||||||
pushStatus({
|
pushStatus({
|
||||||
connected: true,
|
...createConnectedChannelStatusPatch(at),
|
||||||
lastEventAt: at,
|
|
||||||
lastConnectedAt: at,
|
|
||||||
lastDisconnect: null,
|
lastDisconnect: null,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ import {
|
|||||||
resolveDefaultGroupPolicy,
|
resolveDefaultGroupPolicy,
|
||||||
warnMissingProviderGroupPolicyFallbackOnce,
|
warnMissingProviderGroupPolicyFallbackOnce,
|
||||||
} from "../../config/runtime-group-policy.js";
|
} from "../../config/runtime-group-policy.js";
|
||||||
|
import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js";
|
||||||
import { danger, logVerbose, shouldLogVerbose, warn } from "../../globals.js";
|
import { danger, logVerbose, shouldLogVerbose, warn } from "../../globals.js";
|
||||||
import { formatErrorMessage } from "../../infra/errors.js";
|
import { formatErrorMessage } from "../../infra/errors.js";
|
||||||
import { createDiscordRetryRunner } from "../../infra/retry-policy.js";
|
import { createDiscordRetryRunner } from "../../infra/retry-policy.js";
|
||||||
@@ -752,7 +753,7 @@ export async function monitorDiscordProvider(opts: MonitorDiscordOpts = {}) {
|
|||||||
botUserId && botUserName ? `${botUserId} (${botUserName})` : (botUserId ?? botUserName ?? "");
|
botUserId && botUserName ? `${botUserId} (${botUserName})` : (botUserId ?? botUserName ?? "");
|
||||||
runtime.log?.(`logged in to discord${botIdentity ? ` as ${botIdentity}` : ""}`);
|
runtime.log?.(`logged in to discord${botIdentity ? ` as ${botIdentity}` : ""}`);
|
||||||
if (lifecycleGateway?.isConnected) {
|
if (lifecycleGateway?.isConnected) {
|
||||||
opts.setStatus?.({ connected: true });
|
opts.setStatus?.(createConnectedChannelStatusPatch());
|
||||||
}
|
}
|
||||||
|
|
||||||
lifecycleStarted = true;
|
lifecycleStarted = true;
|
||||||
|
|||||||
@@ -489,16 +489,34 @@ describe("channel-health-monitor", () => {
|
|||||||
await expectNoRestart(manager);
|
await expectNoRestart(manager);
|
||||||
});
|
});
|
||||||
|
|
||||||
it("restarts a channel that never received any event past the stale threshold", async () => {
|
it("restarts a channel that has seen no events since connect past the stale threshold", async () => {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
const manager = createSlackSnapshotManager(
|
const manager = createSlackSnapshotManager(
|
||||||
runningConnectedSlackAccount({
|
runningConnectedSlackAccount({
|
||||||
lastStartAt: now - STALE_THRESHOLD - 60_000,
|
lastStartAt: now - STALE_THRESHOLD - 60_000,
|
||||||
|
lastEventAt: now - STALE_THRESHOLD - 60_000,
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
await expectRestartedChannel(manager, "slack");
|
await expectRestartedChannel(manager, "slack");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("skips connected channels that do not report event liveness", async () => {
|
||||||
|
const now = Date.now();
|
||||||
|
const manager = createSnapshotManager({
|
||||||
|
telegram: {
|
||||||
|
default: {
|
||||||
|
running: true,
|
||||||
|
connected: true,
|
||||||
|
enabled: true,
|
||||||
|
configured: true,
|
||||||
|
lastStartAt: now - STALE_THRESHOLD - 60_000,
|
||||||
|
lastEventAt: null,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
await expectNoRestart(manager);
|
||||||
|
});
|
||||||
|
|
||||||
it("respects custom staleEventThresholdMs", async () => {
|
it("respects custom staleEventThresholdMs", async () => {
|
||||||
const customThreshold = 10 * 60_000;
|
const customThreshold = 10 * 60_000;
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ describe("evaluateChannelHealth", () => {
|
|||||||
enabled: true,
|
enabled: true,
|
||||||
configured: true,
|
configured: true,
|
||||||
lastStartAt: 0,
|
lastStartAt: 0,
|
||||||
lastEventAt: null,
|
lastEventAt: 0,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
channelId: "discord",
|
channelId: "discord",
|
||||||
@@ -142,6 +142,65 @@ describe("evaluateChannelHealth", () => {
|
|||||||
);
|
);
|
||||||
expect(evaluation).toEqual({ healthy: true, reason: "healthy" });
|
expect(evaluation).toEqual({ healthy: true, reason: "healthy" });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("does not flag stale sockets for channels without event tracking", () => {
|
||||||
|
const evaluation = evaluateChannelHealth(
|
||||||
|
{
|
||||||
|
running: true,
|
||||||
|
connected: true,
|
||||||
|
enabled: true,
|
||||||
|
configured: true,
|
||||||
|
lastStartAt: 0,
|
||||||
|
lastEventAt: null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
channelId: "discord",
|
||||||
|
now: 100_000,
|
||||||
|
channelConnectGraceMs: 10_000,
|
||||||
|
staleEventThresholdMs: 30_000,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
expect(evaluation).toEqual({ healthy: true, reason: "healthy" });
|
||||||
|
});
|
||||||
|
|
||||||
|
it("does not flag stale sockets without an active connected socket", () => {
|
||||||
|
const evaluation = evaluateChannelHealth(
|
||||||
|
{
|
||||||
|
running: true,
|
||||||
|
enabled: true,
|
||||||
|
configured: true,
|
||||||
|
lastStartAt: 0,
|
||||||
|
lastEventAt: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
channelId: "slack",
|
||||||
|
now: 100_000,
|
||||||
|
channelConnectGraceMs: 10_000,
|
||||||
|
staleEventThresholdMs: 30_000,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
expect(evaluation).toEqual({ healthy: true, reason: "healthy" });
|
||||||
|
});
|
||||||
|
|
||||||
|
it("ignores inherited event timestamps from a previous lifecycle", () => {
|
||||||
|
const evaluation = evaluateChannelHealth(
|
||||||
|
{
|
||||||
|
running: true,
|
||||||
|
connected: true,
|
||||||
|
enabled: true,
|
||||||
|
configured: true,
|
||||||
|
lastStartAt: 50_000,
|
||||||
|
lastEventAt: 10_000,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
channelId: "slack",
|
||||||
|
now: 100_000,
|
||||||
|
channelConnectGraceMs: 10_000,
|
||||||
|
staleEventThresholdMs: 30_000,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
expect(evaluation).toEqual({ healthy: true, reason: "healthy" });
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("resolveChannelRestartReason", () => {
|
describe("resolveChannelRestartReason", () => {
|
||||||
|
|||||||
@@ -103,17 +103,17 @@ export function evaluateChannelHealth(
|
|||||||
// Skip stale-socket check for Telegram (long-polling mode). Each polling request
|
// Skip stale-socket check for Telegram (long-polling mode). Each polling request
|
||||||
// acts as a heartbeat, so the half-dead WebSocket scenario this check is designed
|
// acts as a heartbeat, so the half-dead WebSocket scenario this check is designed
|
||||||
// to catch does not apply to Telegram's long-polling architecture.
|
// to catch does not apply to Telegram's long-polling architecture.
|
||||||
if (policy.channelId !== "telegram") {
|
if (
|
||||||
if (snapshot.lastEventAt != null || snapshot.lastStartAt != null) {
|
policy.channelId !== "telegram" &&
|
||||||
const upSince = snapshot.lastStartAt ?? 0;
|
snapshot.connected === true &&
|
||||||
const upDuration = policy.now - upSince;
|
snapshot.lastEventAt != null
|
||||||
if (upDuration > policy.staleEventThresholdMs) {
|
) {
|
||||||
const lastEvent = snapshot.lastEventAt ?? 0;
|
if (lastStartAt != null && snapshot.lastEventAt < lastStartAt) {
|
||||||
const eventAge = policy.now - lastEvent;
|
return { healthy: true, reason: "healthy" };
|
||||||
if (eventAge > policy.staleEventThresholdMs) {
|
}
|
||||||
return { healthy: false, reason: "stale-socket" };
|
const eventAge = policy.now - snapshot.lastEventAt;
|
||||||
}
|
if (eventAge > policy.staleEventThresholdMs) {
|
||||||
}
|
return { healthy: false, reason: "stale-socket" };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return { healthy: true, reason: "healthy" };
|
return { healthy: true, reason: "healthy" };
|
||||||
|
|||||||
12
src/gateway/channel-status-patches.test.ts
Normal file
12
src/gateway/channel-status-patches.test.ts
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
import { createConnectedChannelStatusPatch } from "./channel-status-patches.js";
|
||||||
|
|
||||||
|
describe("createConnectedChannelStatusPatch", () => {
|
||||||
|
it("uses one timestamp for connected event-liveness state", () => {
|
||||||
|
expect(createConnectedChannelStatusPatch(1234)).toEqual({
|
||||||
|
connected: true,
|
||||||
|
lastConnectedAt: 1234,
|
||||||
|
lastEventAt: 1234,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
15
src/gateway/channel-status-patches.ts
Normal file
15
src/gateway/channel-status-patches.ts
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
/**
 * Status fields that are published together when a channel reports a live
 * connection: the `connected` flag plus the connect-time and last-event
 * timestamps.
 */
export type ConnectedChannelStatusPatch = {
|
||||||
|
connected: true;
|
||||||
|
lastConnectedAt: number;
|
||||||
|
lastEventAt: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Builds the status patch for a channel that has just connected.
 *
 * `lastConnectedAt` and `lastEventAt` are both set to the same `at` value, so
 * event liveness is seeded at connect time from a single timestamp.
 *
 * @param at - Timestamp written into both fields; defaults to `Date.now()`.
 * @returns A patch with `connected: true` and both timestamps equal to `at`.
 */
export function createConnectedChannelStatusPatch(
|
||||||
|
at: number = Date.now(),
|
||||||
|
): ConnectedChannelStatusPatch {
|
||||||
|
return {
|
||||||
|
connected: true,
|
||||||
|
lastConnectedAt: at,
|
||||||
|
lastEventAt: at,
|
||||||
|
};
|
||||||
|
}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it, vi } from "vitest";
|
||||||
import { __testing } from "./provider.js";
|
import { __testing } from "./provider.js";
|
||||||
|
|
||||||
class FakeEmitter {
|
class FakeEmitter {
|
||||||
@@ -22,6 +22,22 @@ class FakeEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
describe("slack socket reconnect helpers", () => {
|
describe("slack socket reconnect helpers", () => {
|
||||||
|
it("seeds event liveness when socket mode connects", () => {
|
||||||
|
const setStatus = vi.fn();
|
||||||
|
|
||||||
|
__testing.publishSlackConnectedStatus(setStatus);
|
||||||
|
|
||||||
|
expect(setStatus).toHaveBeenCalledTimes(1);
|
||||||
|
expect(setStatus).toHaveBeenCalledWith(
|
||||||
|
expect.objectContaining({
|
||||||
|
connected: true,
|
||||||
|
lastConnectedAt: expect.any(Number),
|
||||||
|
lastEventAt: expect.any(Number),
|
||||||
|
lastError: null,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
it("resolves disconnect waiter on socket disconnect event", async () => {
|
it("resolves disconnect waiter on socket disconnect event", async () => {
|
||||||
const client = new FakeEmitter();
|
const client = new FakeEmitter();
|
||||||
const app = { receiver: { client } };
|
const app = { receiver: { client } };
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ import {
|
|||||||
} from "../../config/runtime-group-policy.js";
|
} from "../../config/runtime-group-policy.js";
|
||||||
import type { SessionScope } from "../../config/sessions.js";
|
import type { SessionScope } from "../../config/sessions.js";
|
||||||
import { normalizeResolvedSecretInputString } from "../../config/types.secrets.js";
|
import { normalizeResolvedSecretInputString } from "../../config/types.secrets.js";
|
||||||
|
import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js";
|
||||||
import { warn } from "../../globals.js";
|
import { warn } from "../../globals.js";
|
||||||
import { computeBackoff, sleepWithAbort } from "../../infra/backoff.js";
|
import { computeBackoff, sleepWithAbort } from "../../infra/backoff.js";
|
||||||
import { installRequestBodyLimitGuard } from "../../infra/http-body.js";
|
import { installRequestBodyLimitGuard } from "../../infra/http-body.js";
|
||||||
@@ -65,6 +66,17 @@ function parseApiAppIdFromAppToken(raw?: string) {
|
|||||||
return match?.[1]?.toUpperCase();
|
return match?.[1]?.toUpperCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Publishes the "connected" status for the Slack provider.
 *
 * Does nothing when no `setStatus` callback is supplied. Otherwise it calls the
 * callback once with the connected status patch stamped with the current time,
 * and with `lastError` reset to `null`.
 *
 * @param setStatus - Optional callback that receives the status patch.
 */
function publishSlackConnectedStatus(setStatus?: (next: Record<string, unknown>) => void) {
|
||||||
|
if (!setStatus) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const now = Date.now();
|
||||||
|
setStatus({
|
||||||
|
...createConnectedChannelStatusPatch(now),
|
||||||
|
lastError: null,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
|
export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
|
||||||
const cfg = opts.config ?? loadConfig();
|
const cfg = opts.config ?? loadConfig();
|
||||||
const runtime: RuntimeEnv = opts.runtime ?? createNonExitingRuntime();
|
const runtime: RuntimeEnv = opts.runtime ?? createNonExitingRuntime();
|
||||||
@@ -390,6 +402,7 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
|
|||||||
try {
|
try {
|
||||||
await app.start();
|
await app.start();
|
||||||
reconnectAttempts = 0;
|
reconnectAttempts = 0;
|
||||||
|
publishSlackConnectedStatus(opts.setStatus);
|
||||||
runtime.log?.("slack socket mode connected");
|
runtime.log?.("slack socket mode connected");
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
// Auth errors (account_inactive, invalid_auth, etc.) are permanent —
|
// Auth errors (account_inactive, invalid_auth, etc.) are permanent —
|
||||||
@@ -481,6 +494,7 @@ export async function monitorSlackProvider(opts: MonitorSlackOpts = {}) {
|
|||||||
export { isNonRecoverableSlackAuthError } from "./reconnect-policy.js";
|
export { isNonRecoverableSlackAuthError } from "./reconnect-policy.js";
|
||||||
|
|
||||||
export const __testing = {
|
export const __testing = {
|
||||||
|
publishSlackConnectedStatus,
|
||||||
resolveSlackRuntimeGroupPolicy: resolveOpenProviderRuntimeGroupPolicy,
|
resolveSlackRuntimeGroupPolicy: resolveOpenProviderRuntimeGroupPolicy,
|
||||||
resolveDefaultGroupPolicy,
|
resolveDefaultGroupPolicy,
|
||||||
getSocketEmitter,
|
getSocketEmitter,
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import { DEFAULT_GROUP_HISTORY_LIMIT } from "../../auto-reply/reply/history.js";
|
|||||||
import { formatCliCommand } from "../../cli/command-format.js";
|
import { formatCliCommand } from "../../cli/command-format.js";
|
||||||
import { waitForever } from "../../cli/wait.js";
|
import { waitForever } from "../../cli/wait.js";
|
||||||
import { loadConfig } from "../../config/config.js";
|
import { loadConfig } from "../../config/config.js";
|
||||||
|
import { createConnectedChannelStatusPatch } from "../../gateway/channel-status-patches.js";
|
||||||
import { logVerbose } from "../../globals.js";
|
import { logVerbose } from "../../globals.js";
|
||||||
import { formatDurationPrecise } from "../../infra/format-time/format-duration.ts";
|
import { formatDurationPrecise } from "../../infra/format-time/format-duration.ts";
|
||||||
import { enqueueSystemEvent } from "../../infra/system-events.js";
|
import { enqueueSystemEvent } from "../../infra/system-events.js";
|
||||||
@@ -210,9 +211,7 @@ export async function monitorWebChannel(
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
status.connected = true;
|
Object.assign(status, createConnectedChannelStatusPatch());
|
||||||
status.lastConnectedAt = Date.now();
|
|
||||||
status.lastEventAt = status.lastConnectedAt;
|
|
||||||
status.lastError = null;
|
status.lastError = null;
|
||||||
emitStatus();
|
emitStatus();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user