mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 19:04:31 +00:00
Memory: add Arabic query expansion stop words (#23717)
This commit is contained in:
@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
|
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
|
||||||
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc.
|
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc.
|
||||||
- Memory/FTS: add Spanish and Portuguese stop-word filtering for query expansion in FTS-only search mode, improving conversational recall for both languages. Thanks @vincentkoc.
|
- Memory/FTS: add Spanish and Portuguese stop-word filtering for query expansion in FTS-only search mode, improving conversational recall for both languages. Thanks @vincentkoc.
|
||||||
|
- Memory/FTS: add Arabic stop-word filtering for query expansion in FTS-only search mode to reduce conversational filler in Arabic memory searches. Thanks @vincentkoc.
|
||||||
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
|
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
|
||||||
|
|
||||||
### Breaking
|
### Breaking
|
||||||
|
|||||||
@@ -143,6 +143,22 @@ describe("extractKeywords", () => {
|
|||||||
expect(keywords).not.toContain("onde");
|
expect(keywords).not.toContain("onde");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("extracts keywords from Arabic conversational query", () => {
|
||||||
|
const keywords = extractKeywords("بالأمس ناقشنا استراتيجية النشر");
|
||||||
|
expect(keywords).toContain("ناقشنا");
|
||||||
|
expect(keywords).toContain("استراتيجية");
|
||||||
|
expect(keywords).toContain("النشر");
|
||||||
|
expect(keywords).not.toContain("بالأمس");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("filters Arabic question stop words", () => {
|
||||||
|
const keywords = extractKeywords("كيف متى أين ماذا");
|
||||||
|
expect(keywords).not.toContain("كيف");
|
||||||
|
expect(keywords).not.toContain("متى");
|
||||||
|
expect(keywords).not.toContain("أين");
|
||||||
|
expect(keywords).not.toContain("ماذا");
|
||||||
|
});
|
||||||
|
|
||||||
it("handles empty query", () => {
|
it("handles empty query", () => {
|
||||||
expect(extractKeywords("")).toEqual([]);
|
expect(extractKeywords("")).toEqual([]);
|
||||||
expect(extractKeywords(" ")).toEqual([]);
|
expect(extractKeywords(" ")).toEqual([]);
|
||||||
|
|||||||
@@ -262,6 +262,68 @@ const STOP_WORDS_PT = new Set([
|
|||||||
"ajuda",
|
"ajuda",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
/**
 * Arabic stop words dropped during query keyword extraction.
 *
 * The set is consulted one token at a time (`STOP_WORDS_AR.has(token)`), so
 * only single-token entries can ever match. Polite phrases are therefore also
 * listed by their individual words.
 * NOTE(review): assumes tokens are whitespace-delimited and not
 * hamza-normalized before lookup — confirm against the tokenizer.
 */
const STOP_WORDS_AR = new Set([
  // Articles and connectors
  "ال",
  "و",
  "أو",
  "لكن",
  "ثم",
  "بل",
  // Pronouns / references
  "أنا",
  "نحن",
  "هو",
  "هي",
  "هم",
  "هذا",
  "هذه",
  "ذلك",
  "تلك",
  "هنا",
  "هناك",
  // Common prepositions (both hamza spellings of "to" are listed)
  "من",
  "إلى",
  "الى",
  "في",
  "على",
  "عن",
  "مع",
  "بين",
  "ل",
  "ب",
  "ك",
  // Common auxiliaries / vague verbs
  "كان",
  "كانت",
  "يكون",
  "تكون",
  "صار",
  "أصبح",
  "يمكن",
  "ممكن",
  // Time references (vague)
  "بالأمس",
  "أمس", // standard spelling, alongside the hamza-less variant below
  "امس",
  "اليوم",
  "غدا",
  "الآن",
  "قبل",
  "بعد",
  "مؤخرا",
  // Question/request words
  "لماذا",
  "كيف",
  "ماذا",
  "متى",
  "أين",
  "هل",
  // Kept for compatibility; contains a space, so it cannot match a single token.
  "من فضلك",
  // Single-token half of the phrase above ("من" is already listed), so that
  // "please" is actually filtered.
  "فضلك",
  "فضلا",
  "ساعد",
]);
|
||||||
|
|
||||||
const STOP_WORDS_KO = new Set([
|
const STOP_WORDS_KO = new Set([
|
||||||
// Particles (조사)
|
// Particles (조사)
|
||||||
"은",
|
"은",
|
||||||
@@ -669,6 +731,7 @@ export function extractKeywords(query: string): string[] {
|
|||||||
STOP_WORDS_EN.has(token) ||
|
STOP_WORDS_EN.has(token) ||
|
||||||
STOP_WORDS_ES.has(token) ||
|
STOP_WORDS_ES.has(token) ||
|
||||||
STOP_WORDS_PT.has(token) ||
|
STOP_WORDS_PT.has(token) ||
|
||||||
|
STOP_WORDS_AR.has(token) ||
|
||||||
STOP_WORDS_ZH.has(token) ||
|
STOP_WORDS_ZH.has(token) ||
|
||||||
STOP_WORDS_KO.has(token) ||
|
STOP_WORDS_KO.has(token) ||
|
||||||
STOP_WORDS_JA.has(token)
|
STOP_WORDS_JA.has(token)
|
||||||
|
|||||||
Reference in New Issue
Block a user