mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 12:47:39 +00:00
feat: LLM-based query expansion for FTS mode
When searching in FTS-only mode (no embedding provider), extract meaningful keywords from conversational queries — via local stop-word filtering, with an optional LLM-based expander hook — to improve search results. Changes: - New query-expansion module with keyword extraction - Supports English and Chinese stop word filtering - Null safety guards for FTS-only mode (provider can be null) - Lint compliance fixes for string iteration This helps users find relevant memory entries even with vague queries.
This commit is contained in:
@@ -72,7 +72,7 @@ class MemoryManagerEmbeddingOps {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private loadEmbeddingCache(hashes: string[]): Map<string, number[]> {
|
private loadEmbeddingCache(hashes: string[]): Map<string, number[]> {
|
||||||
if (!this.cache.enabled) {
|
if (!this.cache.enabled || !this.provider) {
|
||||||
return new Map();
|
return new Map();
|
||||||
}
|
}
|
||||||
if (hashes.length === 0) {
|
if (hashes.length === 0) {
|
||||||
@@ -114,7 +114,7 @@ class MemoryManagerEmbeddingOps {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private upsertEmbeddingCache(entries: Array<{ hash: string; embedding: number[] }>): void {
|
private upsertEmbeddingCache(entries: Array<{ hash: string; embedding: number[] }>): void {
|
||||||
if (!this.cache.enabled) {
|
if (!this.cache.enabled || !this.provider) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (entries.length === 0) {
|
if (entries.length === 0) {
|
||||||
@@ -245,6 +245,9 @@ class MemoryManagerEmbeddingOps {
|
|||||||
entry: MemoryFileEntry | SessionFileEntry,
|
entry: MemoryFileEntry | SessionFileEntry,
|
||||||
source: MemorySource,
|
source: MemorySource,
|
||||||
): Promise<number[][]> {
|
): Promise<number[][]> {
|
||||||
|
if (!this.provider) {
|
||||||
|
return this.embedChunksInBatches(chunks);
|
||||||
|
}
|
||||||
if (this.provider.id === "openai" && this.openAi) {
|
if (this.provider.id === "openai" && this.openAi) {
|
||||||
return this.embedChunksWithOpenAiBatch(chunks, entry, source);
|
return this.embedChunksWithOpenAiBatch(chunks, entry, source);
|
||||||
}
|
}
|
||||||
@@ -423,7 +426,7 @@ class MemoryManagerEmbeddingOps {
|
|||||||
method: "POST",
|
method: "POST",
|
||||||
url: OPENAI_BATCH_ENDPOINT,
|
url: OPENAI_BATCH_ENDPOINT,
|
||||||
body: {
|
body: {
|
||||||
model: this.openAi?.model ?? this.provider.model,
|
model: this.openAi?.model ?? this.provider?.model ?? "text-embedding-3-small",
|
||||||
input: chunk.text,
|
input: chunk.text,
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
@@ -493,6 +496,9 @@ class MemoryManagerEmbeddingOps {
|
|||||||
if (texts.length === 0) {
|
if (texts.length === 0) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
if (!this.provider) {
|
||||||
|
throw new Error("Cannot embed batch in FTS-only mode (no embedding provider)");
|
||||||
|
}
|
||||||
let attempt = 0;
|
let attempt = 0;
|
||||||
let delayMs = EMBEDDING_RETRY_BASE_DELAY_MS;
|
let delayMs = EMBEDDING_RETRY_BASE_DELAY_MS;
|
||||||
while (true) {
|
while (true) {
|
||||||
@@ -532,7 +538,7 @@ class MemoryManagerEmbeddingOps {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private resolveEmbeddingTimeout(kind: "query" | "batch"): number {
|
private resolveEmbeddingTimeout(kind: "query" | "batch"): number {
|
||||||
const isLocal = this.provider.id === "local";
|
const isLocal = this.provider?.id === "local";
|
||||||
if (kind === "query") {
|
if (kind === "query") {
|
||||||
return isLocal ? EMBEDDING_QUERY_TIMEOUT_LOCAL_MS : EMBEDDING_QUERY_TIMEOUT_REMOTE_MS;
|
return isLocal ? EMBEDDING_QUERY_TIMEOUT_LOCAL_MS : EMBEDDING_QUERY_TIMEOUT_REMOTE_MS;
|
||||||
}
|
}
|
||||||
@@ -540,6 +546,9 @@ class MemoryManagerEmbeddingOps {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async embedQueryWithTimeout(text: string): Promise<number[]> {
|
private async embedQueryWithTimeout(text: string): Promise<number[]> {
|
||||||
|
if (!this.provider) {
|
||||||
|
throw new Error("Cannot embed query in FTS-only mode (no embedding provider)");
|
||||||
|
}
|
||||||
const timeoutMs = this.resolveEmbeddingTimeout("query");
|
const timeoutMs = this.resolveEmbeddingTimeout("query");
|
||||||
log.debug("memory embeddings: query start", { provider: this.provider.id, timeoutMs });
|
log.debug("memory embeddings: query start", { provider: this.provider.id, timeoutMs });
|
||||||
return await this.withTimeout(
|
return await this.withTimeout(
|
||||||
@@ -685,6 +694,15 @@ class MemoryManagerEmbeddingOps {
|
|||||||
entry: MemoryFileEntry | SessionFileEntry,
|
entry: MemoryFileEntry | SessionFileEntry,
|
||||||
options: { source: MemorySource; content?: string },
|
options: { source: MemorySource; content?: string },
|
||||||
) {
|
) {
|
||||||
|
// FTS-only mode: skip indexing if no provider
|
||||||
|
if (!this.provider) {
|
||||||
|
log.debug("Skipping embedding indexing in FTS-only mode", {
|
||||||
|
path: entry.path,
|
||||||
|
source: options.source,
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
|
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
|
||||||
const chunks = enforceEmbeddingMaxInputTokens(
|
const chunks = enforceEmbeddingMaxInputTokens(
|
||||||
this.provider,
|
this.provider,
|
||||||
|
|||||||
@@ -544,6 +544,12 @@ class MemoryManagerSyncOps {
|
|||||||
needsFullReindex: boolean;
|
needsFullReindex: boolean;
|
||||||
progress?: MemorySyncProgressState;
|
progress?: MemorySyncProgressState;
|
||||||
}) {
|
}) {
|
||||||
|
// FTS-only mode: skip embedding sync (no provider)
|
||||||
|
if (!this.provider) {
|
||||||
|
log.debug("Skipping memory file sync in FTS-only mode (no embedding provider)");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const files = await listMemoryFiles(this.workspaceDir, this.settings.extraPaths);
|
const files = await listMemoryFiles(this.workspaceDir, this.settings.extraPaths);
|
||||||
const fileEntries = await Promise.all(
|
const fileEntries = await Promise.all(
|
||||||
files.map(async (file) => buildFileEntry(file, this.workspaceDir)),
|
files.map(async (file) => buildFileEntry(file, this.workspaceDir)),
|
||||||
@@ -619,6 +625,12 @@ class MemoryManagerSyncOps {
|
|||||||
needsFullReindex: boolean;
|
needsFullReindex: boolean;
|
||||||
progress?: MemorySyncProgressState;
|
progress?: MemorySyncProgressState;
|
||||||
}) {
|
}) {
|
||||||
|
// FTS-only mode: skip embedding sync (no provider)
|
||||||
|
if (!this.provider) {
|
||||||
|
log.debug("Skipping session file sync in FTS-only mode (no embedding provider)");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const files = await listSessionFilesForAgent(this.agentId);
|
const files = await listSessionFilesForAgent(this.agentId);
|
||||||
const activePaths = new Set(files.map((file) => sessionPathForFile(file)));
|
const activePaths = new Set(files.map((file) => sessionPathForFile(file)));
|
||||||
const indexAll = params.needsFullReindex || this.sessionsDirtyFiles.size === 0;
|
const indexAll = params.needsFullReindex || this.sessionsDirtyFiles.size === 0;
|
||||||
@@ -759,8 +771,8 @@ class MemoryManagerSyncOps {
|
|||||||
const needsFullReindex =
|
const needsFullReindex =
|
||||||
params?.force ||
|
params?.force ||
|
||||||
!meta ||
|
!meta ||
|
||||||
meta.model !== this.provider.model ||
|
(this.provider && meta.model !== this.provider.model) ||
|
||||||
meta.provider !== this.provider.id ||
|
(this.provider && meta.provider !== this.provider.id) ||
|
||||||
meta.providerKey !== this.providerKey ||
|
meta.providerKey !== this.providerKey ||
|
||||||
meta.chunkTokens !== this.settings.chunking.tokens ||
|
meta.chunkTokens !== this.settings.chunking.tokens ||
|
||||||
meta.chunkOverlap !== this.settings.chunking.overlap ||
|
meta.chunkOverlap !== this.settings.chunking.overlap ||
|
||||||
@@ -834,6 +846,7 @@ class MemoryManagerSyncOps {
|
|||||||
const batch = this.settings.remote?.batch;
|
const batch = this.settings.remote?.batch;
|
||||||
const enabled = Boolean(
|
const enabled = Boolean(
|
||||||
batch?.enabled &&
|
batch?.enabled &&
|
||||||
|
this.provider &&
|
||||||
((this.openAi && this.provider.id === "openai") ||
|
((this.openAi && this.provider.id === "openai") ||
|
||||||
(this.gemini && this.provider.id === "gemini") ||
|
(this.gemini && this.provider.id === "gemini") ||
|
||||||
(this.voyage && this.provider.id === "voyage")),
|
(this.voyage && this.provider.id === "voyage")),
|
||||||
@@ -849,7 +862,7 @@ class MemoryManagerSyncOps {
|
|||||||
|
|
||||||
private async activateFallbackProvider(reason: string): Promise<boolean> {
|
private async activateFallbackProvider(reason: string): Promise<boolean> {
|
||||||
const fallback = this.settings.fallback;
|
const fallback = this.settings.fallback;
|
||||||
if (!fallback || fallback === "none" || fallback === this.provider.id) {
|
if (!fallback || fallback === "none" || !this.provider || fallback === this.provider.id) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.fallbackFrom) {
|
if (this.fallbackFrom) {
|
||||||
@@ -957,8 +970,8 @@ class MemoryManagerSyncOps {
|
|||||||
}
|
}
|
||||||
|
|
||||||
nextMeta = {
|
nextMeta = {
|
||||||
model: this.provider.model,
|
model: this.provider?.model ?? "fts-only",
|
||||||
provider: this.provider.id,
|
provider: this.provider?.id ?? "none",
|
||||||
providerKey: this.providerKey,
|
providerKey: this.providerKey,
|
||||||
chunkTokens: this.settings.chunking.tokens,
|
chunkTokens: this.settings.chunking.tokens,
|
||||||
chunkOverlap: this.settings.chunking.overlap,
|
chunkOverlap: this.settings.chunking.overlap,
|
||||||
@@ -1023,8 +1036,8 @@ class MemoryManagerSyncOps {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const nextMeta: MemoryIndexMeta = {
|
const nextMeta: MemoryIndexMeta = {
|
||||||
model: this.provider.model,
|
model: this.provider?.model ?? "fts-only",
|
||||||
provider: this.provider.id,
|
provider: this.provider?.id ?? "none",
|
||||||
providerKey: this.providerKey,
|
providerKey: this.providerKey,
|
||||||
chunkTokens: this.settings.chunking.tokens,
|
chunkTokens: this.settings.chunking.tokens,
|
||||||
chunkOverlap: this.settings.chunking.overlap,
|
chunkOverlap: this.settings.chunking.overlap,
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ import { isMemoryPath, normalizeExtraMemoryPaths } from "./internal.js";
|
|||||||
import { memoryManagerEmbeddingOps } from "./manager-embedding-ops.js";
|
import { memoryManagerEmbeddingOps } from "./manager-embedding-ops.js";
|
||||||
import { searchKeyword, searchVector } from "./manager-search.js";
|
import { searchKeyword, searchVector } from "./manager-search.js";
|
||||||
import { memoryManagerSyncOps } from "./manager-sync-ops.js";
|
import { memoryManagerSyncOps } from "./manager-sync-ops.js";
|
||||||
|
import { extractKeywords } from "./query-expansion.js";
|
||||||
const SNIPPET_MAX_CHARS = 700;
|
const SNIPPET_MAX_CHARS = 700;
|
||||||
const VECTOR_TABLE = "chunks_vec";
|
const VECTOR_TABLE = "chunks_vec";
|
||||||
const FTS_TABLE = "chunks_fts";
|
const FTS_TABLE = "chunks_fts";
|
||||||
@@ -233,8 +234,34 @@ export class MemoryIndexManager implements MemorySearchManager {
|
|||||||
log.warn("memory search: no provider and FTS unavailable");
|
log.warn("memory search: no provider and FTS unavailable");
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
const ftsResults = await this.searchKeyword(cleaned, candidates).catch(() => []);
|
|
||||||
return ftsResults.filter((entry) => entry.score >= minScore).slice(0, maxResults);
|
// Extract keywords for better FTS matching on conversational queries
|
||||||
|
// e.g., "that thing we discussed about the API" → ["discussed", "API"]
|
||||||
|
const keywords = extractKeywords(cleaned);
|
||||||
|
const searchTerms = keywords.length > 0 ? keywords : [cleaned];
|
||||||
|
|
||||||
|
// Search with each keyword and merge results
|
||||||
|
const resultSets = await Promise.all(
|
||||||
|
searchTerms.map((term) => this.searchKeyword(term, candidates).catch(() => [])),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Merge and deduplicate results, keeping highest score for each chunk
|
||||||
|
const seenIds = new Map<string, (typeof resultSets)[0][0]>();
|
||||||
|
for (const results of resultSets) {
|
||||||
|
for (const result of results) {
|
||||||
|
const existing = seenIds.get(result.id);
|
||||||
|
if (!existing || result.score > existing.score) {
|
||||||
|
seenIds.set(result.id, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const merged = [...seenIds.values()]
|
||||||
|
.toSorted((a, b) => b.score - a.score)
|
||||||
|
.filter((entry) => entry.score >= minScore)
|
||||||
|
.slice(0, maxResults);
|
||||||
|
|
||||||
|
return merged;
|
||||||
}
|
}
|
||||||
|
|
||||||
const keywordResults = hybrid.enabled
|
const keywordResults = hybrid.enabled
|
||||||
|
|||||||
78
src/memory/query-expansion.test.ts
Normal file
78
src/memory/query-expansion.test.ts
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
import { expandQueryForFts, extractKeywords } from "./query-expansion.js";
|
||||||
|
|
||||||
|
describe("extractKeywords", () => {
|
||||||
|
it("extracts keywords from English conversational query", () => {
|
||||||
|
const keywords = extractKeywords("that thing we discussed about the API");
|
||||||
|
expect(keywords).toContain("discussed");
|
||||||
|
expect(keywords).toContain("api");
|
||||||
|
// Should not include stop words
|
||||||
|
expect(keywords).not.toContain("that");
|
||||||
|
expect(keywords).not.toContain("thing");
|
||||||
|
expect(keywords).not.toContain("we");
|
||||||
|
expect(keywords).not.toContain("about");
|
||||||
|
expect(keywords).not.toContain("the");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("extracts keywords from Chinese conversational query", () => {
|
||||||
|
const keywords = extractKeywords("之前讨论的那个方案");
|
||||||
|
expect(keywords).toContain("讨论");
|
||||||
|
expect(keywords).toContain("方案");
|
||||||
|
// Should not include stop words
|
||||||
|
expect(keywords).not.toContain("之前");
|
||||||
|
expect(keywords).not.toContain("的");
|
||||||
|
expect(keywords).not.toContain("那个");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("extracts keywords from mixed language query", () => {
|
||||||
|
const keywords = extractKeywords("昨天讨论的 API design");
|
||||||
|
expect(keywords).toContain("讨论");
|
||||||
|
expect(keywords).toContain("api");
|
||||||
|
expect(keywords).toContain("design");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("returns specific technical terms", () => {
|
||||||
|
const keywords = extractKeywords("what was the solution for the CFR bug");
|
||||||
|
expect(keywords).toContain("solution");
|
||||||
|
expect(keywords).toContain("cfr");
|
||||||
|
expect(keywords).toContain("bug");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("handles empty query", () => {
|
||||||
|
expect(extractKeywords("")).toEqual([]);
|
||||||
|
expect(extractKeywords(" ")).toEqual([]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("handles query with only stop words", () => {
|
||||||
|
const keywords = extractKeywords("the a an is are");
|
||||||
|
expect(keywords.length).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("removes duplicate keywords", () => {
|
||||||
|
const keywords = extractKeywords("test test testing");
|
||||||
|
const testCount = keywords.filter((k) => k === "test").length;
|
||||||
|
expect(testCount).toBe(1);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("expandQueryForFts", () => {
|
||||||
|
it("returns original query and extracted keywords", () => {
|
||||||
|
const result = expandQueryForFts("that API we discussed");
|
||||||
|
expect(result.original).toBe("that API we discussed");
|
||||||
|
expect(result.keywords).toContain("api");
|
||||||
|
expect(result.keywords).toContain("discussed");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("builds expanded OR query for FTS", () => {
|
||||||
|
const result = expandQueryForFts("the solution for bugs");
|
||||||
|
expect(result.expanded).toContain("OR");
|
||||||
|
expect(result.expanded).toContain("solution");
|
||||||
|
expect(result.expanded).toContain("bugs");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("returns original query when no keywords extracted", () => {
|
||||||
|
const result = expandQueryForFts("the");
|
||||||
|
expect(result.keywords.length).toBe(0);
|
||||||
|
expect(result.expanded).toBe("the");
|
||||||
|
});
|
||||||
|
});
|
||||||
357
src/memory/query-expansion.ts
Normal file
357
src/memory/query-expansion.ts
Normal file
@@ -0,0 +1,357 @@
|
|||||||
|
/**
|
||||||
|
* Query expansion for FTS-only search mode.
|
||||||
|
*
|
||||||
|
* When no embedding provider is available, we fall back to FTS (full-text search).
|
||||||
|
* FTS works best with specific keywords, but users often ask conversational queries
|
||||||
|
* like "that thing we discussed yesterday" or "之前讨论的那个方案".
|
||||||
|
*
|
||||||
|
* This module extracts meaningful keywords from such queries to improve FTS results.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Common stop words that don't add search value
|
||||||
|
const STOP_WORDS_EN = new Set([
|
||||||
|
// Articles and determiners
|
||||||
|
"a",
|
||||||
|
"an",
|
||||||
|
"the",
|
||||||
|
"this",
|
||||||
|
"that",
|
||||||
|
"these",
|
||||||
|
"those",
|
||||||
|
// Pronouns
|
||||||
|
"i",
|
||||||
|
"me",
|
||||||
|
"my",
|
||||||
|
"we",
|
||||||
|
"our",
|
||||||
|
"you",
|
||||||
|
"your",
|
||||||
|
"he",
|
||||||
|
"she",
|
||||||
|
"it",
|
||||||
|
"they",
|
||||||
|
"them",
|
||||||
|
// Common verbs
|
||||||
|
"is",
|
||||||
|
"are",
|
||||||
|
"was",
|
||||||
|
"were",
|
||||||
|
"be",
|
||||||
|
"been",
|
||||||
|
"being",
|
||||||
|
"have",
|
||||||
|
"has",
|
||||||
|
"had",
|
||||||
|
"do",
|
||||||
|
"does",
|
||||||
|
"did",
|
||||||
|
"will",
|
||||||
|
"would",
|
||||||
|
"could",
|
||||||
|
"should",
|
||||||
|
"can",
|
||||||
|
"may",
|
||||||
|
"might",
|
||||||
|
// Prepositions
|
||||||
|
"in",
|
||||||
|
"on",
|
||||||
|
"at",
|
||||||
|
"to",
|
||||||
|
"for",
|
||||||
|
"of",
|
||||||
|
"with",
|
||||||
|
"by",
|
||||||
|
"from",
|
||||||
|
"about",
|
||||||
|
"into",
|
||||||
|
"through",
|
||||||
|
"during",
|
||||||
|
"before",
|
||||||
|
"after",
|
||||||
|
"above",
|
||||||
|
"below",
|
||||||
|
"between",
|
||||||
|
"under",
|
||||||
|
"over",
|
||||||
|
// Conjunctions
|
||||||
|
"and",
|
||||||
|
"or",
|
||||||
|
"but",
|
||||||
|
"if",
|
||||||
|
"then",
|
||||||
|
"because",
|
||||||
|
"as",
|
||||||
|
"while",
|
||||||
|
"when",
|
||||||
|
"where",
|
||||||
|
"what",
|
||||||
|
"which",
|
||||||
|
"who",
|
||||||
|
"how",
|
||||||
|
"why",
|
||||||
|
// Time references (vague, not useful for FTS)
|
||||||
|
"yesterday",
|
||||||
|
"today",
|
||||||
|
"tomorrow",
|
||||||
|
"earlier",
|
||||||
|
"later",
|
||||||
|
"recently",
|
||||||
|
"before",
|
||||||
|
"ago",
|
||||||
|
"just",
|
||||||
|
"now",
|
||||||
|
// Vague references
|
||||||
|
"thing",
|
||||||
|
"things",
|
||||||
|
"stuff",
|
||||||
|
"something",
|
||||||
|
"anything",
|
||||||
|
"everything",
|
||||||
|
"nothing",
|
||||||
|
// Question words
|
||||||
|
"please",
|
||||||
|
"help",
|
||||||
|
"find",
|
||||||
|
"show",
|
||||||
|
"get",
|
||||||
|
"tell",
|
||||||
|
"give",
|
||||||
|
]);
|
||||||
|
|
||||||
|
const STOP_WORDS_ZH = new Set([
|
||||||
|
// Pronouns
|
||||||
|
"我",
|
||||||
|
"我们",
|
||||||
|
"你",
|
||||||
|
"你们",
|
||||||
|
"他",
|
||||||
|
"她",
|
||||||
|
"它",
|
||||||
|
"他们",
|
||||||
|
"这",
|
||||||
|
"那",
|
||||||
|
"这个",
|
||||||
|
"那个",
|
||||||
|
"这些",
|
||||||
|
"那些",
|
||||||
|
// Auxiliary words
|
||||||
|
"的",
|
||||||
|
"了",
|
||||||
|
"着",
|
||||||
|
"过",
|
||||||
|
"得",
|
||||||
|
"地",
|
||||||
|
"吗",
|
||||||
|
"呢",
|
||||||
|
"吧",
|
||||||
|
"啊",
|
||||||
|
"呀",
|
||||||
|
"嘛",
|
||||||
|
"啦",
|
||||||
|
// Verbs (common, vague)
|
||||||
|
"是",
|
||||||
|
"有",
|
||||||
|
"在",
|
||||||
|
"被",
|
||||||
|
"把",
|
||||||
|
"给",
|
||||||
|
"让",
|
||||||
|
"用",
|
||||||
|
"到",
|
||||||
|
"去",
|
||||||
|
"来",
|
||||||
|
"做",
|
||||||
|
"说",
|
||||||
|
"看",
|
||||||
|
"找",
|
||||||
|
"想",
|
||||||
|
"要",
|
||||||
|
"能",
|
||||||
|
"会",
|
||||||
|
"可以",
|
||||||
|
// Prepositions and conjunctions
|
||||||
|
"和",
|
||||||
|
"与",
|
||||||
|
"或",
|
||||||
|
"但",
|
||||||
|
"但是",
|
||||||
|
"因为",
|
||||||
|
"所以",
|
||||||
|
"如果",
|
||||||
|
"虽然",
|
||||||
|
"而",
|
||||||
|
"也",
|
||||||
|
"都",
|
||||||
|
"就",
|
||||||
|
"还",
|
||||||
|
"又",
|
||||||
|
"再",
|
||||||
|
"才",
|
||||||
|
"只",
|
||||||
|
// Time (vague)
|
||||||
|
"之前",
|
||||||
|
"以前",
|
||||||
|
"之后",
|
||||||
|
"以后",
|
||||||
|
"刚才",
|
||||||
|
"现在",
|
||||||
|
"昨天",
|
||||||
|
"今天",
|
||||||
|
"明天",
|
||||||
|
"最近",
|
||||||
|
// Vague references
|
||||||
|
"东西",
|
||||||
|
"事情",
|
||||||
|
"事",
|
||||||
|
"什么",
|
||||||
|
"哪个",
|
||||||
|
"哪些",
|
||||||
|
"怎么",
|
||||||
|
"为什么",
|
||||||
|
"多少",
|
||||||
|
// Question/request words
|
||||||
|
"请",
|
||||||
|
"帮",
|
||||||
|
"帮忙",
|
||||||
|
"告诉",
|
||||||
|
]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a token looks like a meaningful keyword.
|
||||||
|
* Returns false for short tokens, numbers-only, etc.
|
||||||
|
*/
|
||||||
|
function isValidKeyword(token: string): boolean {
|
||||||
|
if (!token || token.length === 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Skip very short English words (likely stop words or fragments)
|
||||||
|
if (/^[a-zA-Z]+$/.test(token) && token.length < 3) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Skip pure numbers (not useful for semantic search)
|
||||||
|
if (/^\d+$/.test(token)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Skip tokens that are all punctuation
|
||||||
|
if (/^[\p{P}\p{S}]+$/u.test(token)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tokenizer that handles both English and Chinese text.
|
||||||
|
* For Chinese, we do character-based splitting since we don't have a proper segmenter.
|
||||||
|
* For English, we split on whitespace and punctuation.
|
||||||
|
*/
|
||||||
|
function tokenize(text: string): string[] {
|
||||||
|
const tokens: string[] = [];
|
||||||
|
const normalized = text.toLowerCase().trim();
|
||||||
|
|
||||||
|
// Split into segments (English words, Chinese character sequences, etc.)
|
||||||
|
const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);
|
||||||
|
|
||||||
|
for (const segment of segments) {
|
||||||
|
// Check if segment contains CJK characters
|
||||||
|
if (/[\u4e00-\u9fff]/.test(segment)) {
|
||||||
|
// For Chinese, extract character n-grams (unigrams and bigrams)
|
||||||
|
const chars = Array.from(segment).filter((c) => /[\u4e00-\u9fff]/.test(c));
|
||||||
|
// Add individual characters
|
||||||
|
tokens.push(...chars);
|
||||||
|
// Add bigrams for better phrase matching
|
||||||
|
for (let i = 0; i < chars.length - 1; i++) {
|
||||||
|
tokens.push(chars[i] + chars[i + 1]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// For non-CJK, keep as single token
|
||||||
|
tokens.push(segment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract keywords from a conversational query for FTS search.
|
||||||
|
*
|
||||||
|
* Examples:
|
||||||
|
* - "that thing we discussed about the API" → ["discussed", "API"]
|
||||||
|
* - "之前讨论的那个方案" → ["讨论", "方案"]
|
||||||
|
* - "what was the solution for the bug" → ["solution", "bug"]
|
||||||
|
*/
|
||||||
|
export function extractKeywords(query: string): string[] {
|
||||||
|
const tokens = tokenize(query);
|
||||||
|
const keywords: string[] = [];
|
||||||
|
const seen = new Set<string>();
|
||||||
|
|
||||||
|
for (const token of tokens) {
|
||||||
|
// Skip stop words
|
||||||
|
if (STOP_WORDS_EN.has(token) || STOP_WORDS_ZH.has(token)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Skip invalid keywords
|
||||||
|
if (!isValidKeyword(token)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Skip duplicates
|
||||||
|
if (seen.has(token)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
seen.add(token);
|
||||||
|
keywords.push(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
return keywords;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expand a query for FTS search.
|
||||||
|
* Returns both the original query and extracted keywords for OR-matching.
|
||||||
|
*
|
||||||
|
* @param query - User's original query
|
||||||
|
* @returns Object with original query and extracted keywords
|
||||||
|
*/
|
||||||
|
export function expandQueryForFts(query: string): {
|
||||||
|
original: string;
|
||||||
|
keywords: string[];
|
||||||
|
expanded: string;
|
||||||
|
} {
|
||||||
|
const original = query.trim();
|
||||||
|
const keywords = extractKeywords(original);
|
||||||
|
|
||||||
|
// Build expanded query: original terms OR extracted keywords
|
||||||
|
// This ensures both exact matches and keyword matches are found
|
||||||
|
const expanded = keywords.length > 0 ? `${original} OR ${keywords.join(" OR ")}` : original;
|
||||||
|
|
||||||
|
return { original, keywords, expanded };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Type for an optional LLM-based query expander.
|
||||||
|
* Can be provided to enhance keyword extraction with semantic understanding.
|
||||||
|
*/
|
||||||
|
export type LlmQueryExpander = (query: string) => Promise<string[]>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expand query with optional LLM assistance.
|
||||||
|
* Falls back to local extraction if LLM is unavailable or fails.
|
||||||
|
*/
|
||||||
|
export async function expandQueryWithLlm(
|
||||||
|
query: string,
|
||||||
|
llmExpander?: LlmQueryExpander,
|
||||||
|
): Promise<string[]> {
|
||||||
|
// If LLM expander is provided, try it first
|
||||||
|
if (llmExpander) {
|
||||||
|
try {
|
||||||
|
const llmKeywords = await llmExpander(query);
|
||||||
|
if (llmKeywords.length > 0) {
|
||||||
|
return llmKeywords;
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// LLM failed, fall back to local extraction
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to local keyword extraction
|
||||||
|
return extractKeywords(query);
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user