Memory/QMD: normalize Han-script BM25 search queries

This commit is contained in:
Vignesh Natarajan
2026-02-22 01:53:00 -08:00
parent 9f0b6a8c92
commit 99a2f5379e
3 changed files with 156 additions and 2 deletions

View File

@@ -31,6 +31,7 @@ import type {
ResolvedQmdMcporterConfig,
} from "./backend-config.js";
import { parseQmdQueryJson, type QmdQueryResult } from "./qmd-query-parser.js";
import { extractKeywords } from "./query-expansion.js";
const log = createSubsystemLogger("memory");
@@ -40,9 +41,45 @@ const MAX_QMD_OUTPUT_CHARS = 200_000;
const NUL_MARKER_RE = /(?:\^@|\\0|\\x00|\\u0000|null\s*byte|nul\s*byte)/i;
const QMD_EMBED_BACKOFF_BASE_MS = 60_000;
const QMD_EMBED_BACKOFF_MAX_MS = 60 * 60 * 1000;
const HAN_SCRIPT_RE = /[\u3400-\u9fff]/u;
const QMD_BM25_HAN_KEYWORD_LIMIT = 12;
let qmdEmbedQueueTail: Promise<void> = Promise.resolve();
function hasHanScript(value: string): boolean {
return HAN_SCRIPT_RE.test(value);
}
function normalizeHanBm25Query(query: string): string {
const trimmed = query.trim();
if (!trimmed || !hasHanScript(trimmed)) {
return trimmed;
}
const keywords = extractKeywords(trimmed);
const normalizedKeywords: string[] = [];
const seen = new Set<string>();
for (const keyword of keywords) {
const token = keyword.trim();
if (!token || seen.has(token)) {
continue;
}
const includesHan = hasHanScript(token);
// Han unigrams are usually too broad for BM25 and can drown signal.
if (includesHan && Array.from(token).length < 2) {
continue;
}
if (!includesHan && token.length < 2) {
continue;
}
seen.add(token);
normalizedKeywords.push(token);
if (normalizedKeywords.length >= QMD_BM25_HAN_KEYWORD_LIMIT) {
break;
}
}
return normalizedKeywords.length > 0 ? normalizedKeywords.join(" ") : trimmed;
}
async function runWithQmdEmbedLock<T>(task: () => Promise<T>): Promise<T> {
const previous = qmdEmbedQueueTail;
let release: (() => void) | undefined;
@@ -1728,10 +1765,11 @@ export class QmdMemoryManager implements MemorySearchManager {
query: string,
limit: number,
): string[] {
const normalizedQuery = command === "search" ? normalizeHanBm25Query(query) : query;
if (command === "query") {
return ["query", query, "--json", "-n", String(limit)];
return ["query", normalizedQuery, "--json", "-n", String(limit)];
}
return [command, query, "--json", "-n", String(limit)];
return [command, normalizedQuery, "--json", "-n", String(limit)];
}
}