mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 10:01:24 +00:00
Memory/QMD: normalize Han-script BM25 search queries
This commit is contained in:
@@ -31,6 +31,7 @@ import type {
|
||||
ResolvedQmdMcporterConfig,
|
||||
} from "./backend-config.js";
|
||||
import { parseQmdQueryJson, type QmdQueryResult } from "./qmd-query-parser.js";
|
||||
import { extractKeywords } from "./query-expansion.js";
|
||||
|
||||
const log = createSubsystemLogger("memory");
|
||||
|
||||
@@ -40,9 +41,45 @@ const MAX_QMD_OUTPUT_CHARS = 200_000;
|
||||
const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;
|
||||
const QMD_EMBED_BACKOFF_BASE_MS = 60_000;
|
||||
const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;
|
||||
const HAN_SCRIPT_RE = /[\u3400-\u9fff]/u;
|
||||
const QMD_BM25_HAN_KEYWORD_LIMIT = 12;
|
||||
|
||||
let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
|
||||
|
||||
function hasHanScript(value: string): boolean {
|
||||
return HAN_SCRIPT_RE.test(value);
|
||||
}
|
||||
|
||||
function normalizeHanBm25Query(query: string): string {
|
||||
const trimmed = query.trim();
|
||||
if (!trimmed || !hasHanScript(trimmed)) {
|
||||
return trimmed;
|
||||
}
|
||||
const keywords = extractKeywords(trimmed);
|
||||
const normalizedKeywords: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const keyword of keywords) {
|
||||
const token = keyword.trim();
|
||||
if (!token || seen.has(token)) {
|
||||
continue;
|
||||
}
|
||||
const includesHan = hasHanScript(token);
|
||||
// Han unigrams are usually too broad for BM25 and can drown signal.
|
||||
if (includesHan && Array.from(token).length < 2) {
|
||||
continue;
|
||||
}
|
||||
if (!includesHan && token.length < 2) {
|
||||
continue;
|
||||
}
|
||||
seen.add(token);
|
||||
normalizedKeywords.push(token);
|
||||
if (normalizedKeywords.length >= QMD_BM25_HAN_KEYWORD_LIMIT) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return normalizedKeywords.length > 0 ? normalizedKeywords.join(" ") : trimmed;
|
||||
}
|
||||
|
||||
async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
|
||||
const previous = qmdEmbedQueueTail;
|
||||
let release: (() => void) | undefined;
|
||||
@@ -1728,10 +1765,11 @@ export class QmdMemoryManager implements MemorySearchManager {
|
||||
query: string,
|
||||
limit: number,
|
||||
): string[] {
|
||||
const normalizedQuery = command === "search" ? normalizeHanBm25Query(query) : query;
|
||||
if (command === "query") {
|
||||
return ["query", query, "--json", "-n", String(limit)];
|
||||
return ["query", normalizedQuery, "--json", "-n", String(limit)];
|
||||
}
|
||||
return [command, query, "--json", "-n", String(limit)];
|
||||
return [command, normalizedQuery, "--json", "-n", String(limit)];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user