mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 01:52:44 +00:00
fix: replace file-based session store lock with in-process Promise chain mutex (#14498)
* fix: replace file-based session store lock with in-process Promise chain mutex
Node.js is single-threaded, so file-based locking (open('wx') + polling +
stale eviction) is unnecessary and causes timeouts under heavy session load.
Replace with a simple per-storePath Promise chain that serializes access
without any filesystem overhead.
In a 1159-session environment over 3 hours:
- Lock timeouts: 25
- Stuck sessions: 157 (max 1031s, avg 388s)
- Slow listeners: 39 (max 265s, avg 70s)
Root cause: during sessions.json file I/O, await yields control and other
lock requests hit the 10s timeout waiting for the .lock file to be released.
* test: add comprehensive tests for Promise chain mutex lock
- Concurrent access serialization (10 parallel writers, counter integrity)
- Error resilience (single & multiple consecutive throws don't poison queue)
- Independent storePath parallelism (different paths run concurrently)
- LOCK_QUEUES cleanup after completion and after errors
- No .lock file created on disk
Also fix: store caught promise in LOCK_QUEUES to avoid unhandled rejection
warnings when queued fn() throws.
* fix: add timeout to Promise chain mutex to prevent infinite hangs on Windows
* fix(session-store): enforce strict queue timeout + cross-process lock
---------
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
committed by
GitHub
parent
13bfd9da83
commit
c6ecd2a044
@@ -3,6 +3,7 @@ import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import type { MsgContext } from "../../auto-reply/templating.js";
|
||||
import type { SessionMaintenanceConfig, SessionMaintenanceMode } from "../types.base.js";
|
||||
import { acquireSessionWriteLock } from "../../agents/session-write-lock.js";
|
||||
import { parseByteSize } from "../../cli/parse-bytes.js";
|
||||
import { parseDurationMs } from "../../cli/parse-duration.js";
|
||||
import { createSubsystemLogger } from "../../logging/subsystem.js";
|
||||
@@ -115,6 +116,28 @@ function normalizeSessionStore(store: Record<string, SessionEntry>): void {
|
||||
|
||||
/**
 * Test-only reset of the module-level session store state.
 *
 * Clears the session store cache and drops every lock queue. Before the
 * queues are dropped, each task still waiting in a queue is flagged as timed
 * out (the drain loop skips tasks carrying that flag) and its pending timeout
 * timer, if any, is cancelled so it cannot fire after the reset.
 *
 * Note: the promises belonging to those pending tasks are neither resolved
 * nor rejected here.
 */
export function clearSessionStoreCacheForTest(): void {
  SESSION_STORE_CACHE.clear();
  for (const queue of LOCK_QUEUES.values()) {
    for (const task of queue.pending) {
      task.timedOut = true;
      if (task.timer) {
        clearTimeout(task.timer);
      }
    }
  }
  LOCK_QUEUES.clear();
}
|
||||
|
||||
/**
 * Expose lock queue size for tests.
 *
 * @returns The number of store paths that currently have an entry in the
 *   lock-queue map (not the number of pending tasks).
 */
export function getSessionStoreLockQueueSizeForTest(): number {
  return LOCK_QUEUES.size;
}
|
||||
|
||||
/**
 * Test-only entry point that forwards to the module-private
 * `withSessionStoreLock` with the same arguments.
 *
 * @param storePath Path of the session store file to lock.
 * @param fn Callback to run while the lock is held.
 * @param opts Optional lock tuning (timeout, stale threshold, legacy poll interval).
 * @returns Whatever `fn` resolves to.
 */
export async function withSessionStoreLockForTest<T>(
  storePath: string,
  fn: () => Promise<T>,
  opts: SessionStoreLockOptions = {},
): Promise<T> {
  return await withSessionStoreLock(storePath, fn, opts);
}
|
||||
|
||||
type LoadSessionStoreOptions = {
|
||||
@@ -584,76 +607,149 @@ type SessionStoreLockOptions = {
|
||||
staleMs?: number;
|
||||
};
|
||||
|
||||
/** One queued request to run a callback while holding a session store lock. */
type SessionStoreLockTask = {
  /** Callback executed by the drain loop once the write lock is acquired. */
  fn: () => Promise<unknown>;
  /** Settles the caller's promise with the callback's result. */
  resolve: (value: unknown) => void;
  /** Settles the caller's promise with a failure (callback error or lock timeout). */
  reject: (reason: unknown) => void;
  /** Absolute deadline (epoch ms, compared against `Date.now()`); absent when no timeout applies. */
  timeoutAt?: number;
  /** Stale-lock threshold forwarded to `acquireSessionWriteLock`. */
  staleMs: number;
  /** Handle of the queue-wait timeout timer, when a timeout is configured. */
  timer?: ReturnType<typeof setTimeout>;
  /** Set once the drain loop has dequeued the task and begun processing it. */
  started: boolean;
  /** Set when the task was abandoned (timeout or test reset); such tasks are skipped. */
  timedOut: boolean;
};
|
||||
|
||||
/** In-process FIFO of lock tasks for a single store path. */
type SessionStoreLockQueue = {
  /** True while a drain loop is actively processing this queue. */
  running: boolean;
  /** Tasks waiting for their turn, oldest first. */
  pending: SessionStoreLockTask[];
};
|
||||
|
||||
/** Lock queues keyed by store path; entries are created on demand and removed once a queue has drained. */
const LOCK_QUEUES = new Map<string, SessionStoreLockQueue>();
|
||||
|
||||
/**
 * Build the error used to reject a task that waited too long for the lock.
 *
 * @param storePath Path of the session store the caller was waiting on.
 * @returns An `Error` whose message names the store path.
 */
function lockTimeoutError(storePath: string): Error {
  return new Error(`timeout waiting for session store lock: ${storePath}`);
}
|
||||
|
||||
/**
 * Look up the lock queue for a store path, registering a fresh idle queue
 * when none exists yet.
 *
 * @param storePath Path of the session store; used as the map key.
 * @returns The existing or newly created queue for that path.
 */
function getOrCreateLockQueue(storePath: string): SessionStoreLockQueue {
  const existing = LOCK_QUEUES.get(storePath);
  if (existing) {
    return existing;
  }
  const created: SessionStoreLockQueue = { running: false, pending: [] };
  LOCK_QUEUES.set(storePath, created);
  return created;
}
|
||||
|
||||
/**
 * Remove a task from a queue's pending list, matching by object identity.
 * Does nothing when the task is not (or no longer) in the list.
 *
 * @param queue Queue whose `pending` array is mutated in place.
 * @param task Task to remove.
 */
function removePendingTask(queue: SessionStoreLockQueue, task: SessionStoreLockTask): void {
  const idx = queue.pending.indexOf(task);
  if (idx >= 0) {
    queue.pending.splice(idx, 1);
  }
}
|
||||
|
||||
/**
 * Process the pending tasks of one store path's lock queue, one at a time.
 *
 * Returns immediately when the path has no queue or another drain is already
 * running for it. For each dequeued task that has not been abandoned, the
 * loop cancels the task's queue-wait timer, acquires the session write lock
 * (bounded by whatever is left of the task's deadline), runs the task's
 * callback, releases the lock, and settles the task's promise.
 *
 * Failures from lock acquisition or from the callback reject only that task;
 * the loop then continues with the next one.
 *
 * @param storePath Path of the session store whose queue should be drained.
 */
async function drainSessionStoreLockQueue(storePath: string): Promise<void> {
  const queue = LOCK_QUEUES.get(storePath);
  if (!queue || queue.running) {
    return;
  }
  queue.running = true;
  try {
    while (queue.pending.length > 0) {
      const task = queue.pending.shift();
      // Abandoned tasks (timed out while waiting, or reset by tests) are dropped.
      if (!task || task.timedOut) {
        continue;
      }

      // From here on the queue-wait timer must not fire for this task.
      if (task.timer) {
        clearTimeout(task.timer);
      }
      task.started = true;

      const remainingTimeoutMs =
        task.timeoutAt != null
          ? Math.max(0, task.timeoutAt - Date.now())
          : Number.POSITIVE_INFINITY;
      // The deadline already passed while the task sat in the queue.
      if (task.timeoutAt != null && remainingTimeoutMs <= 0) {
        task.timedOut = true;
        task.reject(lockTimeoutError(storePath));
        continue;
      }

      let lock: { release: () => Promise<void> } | undefined;
      let result: unknown;
      let failed: unknown;
      let hasFailure = false;
      try {
        lock = await acquireSessionWriteLock({
          sessionFile: storePath,
          timeoutMs: remainingTimeoutMs,
          staleMs: task.staleMs,
        });
        result = await task.fn();
      } catch (err) {
        hasFailure = true;
        failed = err;
      } finally {
        // Release errors are deliberately ignored (best-effort cleanup).
        await lock?.release().catch(() => undefined);
      }
      if (hasFailure) {
        task.reject(failed);
        continue;
      }
      task.resolve(result);
    }
  } finally {
    queue.running = false;
    if (queue.pending.length === 0) {
      // Only unregister the queue this drain owns. If the map was cleared and
      // a new queue was registered for the same path while this drain was
      // awaiting, deleting by key alone would drop that newer queue.
      if (LOCK_QUEUES.get(storePath) === queue) {
        LOCK_QUEUES.delete(storePath);
      }
    } else {
      // The loop exited with work left over; schedule another pass.
      queueMicrotask(() => {
        void drainSessionStoreLockQueue(storePath);
      });
    }
  }
}
|
||||
|
||||
/**
 * Run `fn` while holding the lock for a session store file.
 *
 * The call is appended to the in-process queue for `storePath` and executed
 * by `drainSessionStoreLockQueue`, which acquires the session write lock
 * around each callback. When a finite, positive timeout is configured, a
 * timer rejects the call with a lock-timeout error if it has not started
 * running before the timeout elapses.
 *
 * NOTE(review): the earlier file-lock implementation created the store's
 * parent directory here before locking. This version presumably relies on
 * `acquireSessionWriteLock` for that — confirm.
 *
 * @param storePath Path of the session store file to lock.
 * @param fn Callback to run under the lock.
 * @param opts Lock options. `timeoutMs` defaults to 10 000 and `staleMs` to
 *   30 000; `pollIntervalMs` is accepted but unused.
 * @returns Whatever `fn` resolves to.
 * @throws The error raised by `fn` or by lock acquisition, or a lock-timeout
 *   error when the wait exceeds `timeoutMs`.
 */
async function withSessionStoreLock<T>(
  storePath: string,
  fn: () => Promise<T>,
  opts: SessionStoreLockOptions = {},
): Promise<T> {
  const timeoutMs = opts.timeoutMs ?? 10_000;
  const staleMs = opts.staleMs ?? 30_000;
  // `pollIntervalMs` is retained for API compatibility with older lock options.
  void opts.pollIntervalMs;

  // A non-positive or non-finite timeout means "wait indefinitely".
  const hasTimeout = timeoutMs > 0 && Number.isFinite(timeoutMs);
  const timeoutAt = hasTimeout ? Date.now() + timeoutMs : undefined;
  const queue = getOrCreateLockQueue(storePath);

  const promise = new Promise<T>((resolve, reject) => {
    const task: SessionStoreLockTask = {
      fn: async () => await fn(),
      resolve: (value) => resolve(value as T),
      reject,
      timeoutAt,
      staleMs,
      started: false,
      timedOut: false,
    };

    if (hasTimeout) {
      task.timer = setTimeout(() => {
        // Once the drain loop has picked the task up (or it was already
        // abandoned), the queue-wait timeout no longer applies.
        if (task.started || task.timedOut) {
          return;
        }
        task.timedOut = true;
        removePendingTask(queue, task);
        reject(lockTimeoutError(storePath));
      }, timeoutMs);
    }

    queue.pending.push(task);
    void drainSessionStoreLockQueue(storePath);
  });

  return await promise;
}
|
||||
|
||||
export async function updateSessionStoreEntry(params: {
|
||||
|
||||
Reference in New Issue
Block a user