mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 08:12:43 +00:00
feat: add Korean language support for memory search query expansion (#18899)
* feat: add Korean stop words and tokenization for memory search
* fix: address review comments on Korean query expansion
* fix: lint errors - curly brace and toSorted
* fix(memory): improve Korean stop words and deduplicate
* Memory: tighten Korean query expansion filtering
* Docs/Changelog: credit Korean memory query expansion

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -38,6 +38,63 @@ describe("extractKeywords", () => {
|
||||
expect(keywords).toContain("bug");
|
||||
});
|
||||
|
||||
it("extracts keywords from Korean conversational query", () => {
  // Run a conversational Korean query through keyword extraction.
  const extracted = extractKeywords("어제 논의한 배포 전략");

  // Each of these query tokens is expected to be kept as a keyword.
  for (const term of ["논의한", "배포", "전략"]) {
    expect(extracted).toContain(term);
  }

  // Should not include stop words
  expect(extracted).not.toContain("어제");
});
|
||||
|
||||
it("strips Korean particles to extract stems", () => {
  // "서버에서" and "에러를" carry trailing particles; "확인" is already bare.
  const extracted = extractKeywords("서버에서 발생한 에러를 확인");

  // The particle-free stems must be present in the result.
  const expectedStems = ["서버", "에러", "확인"];
  for (const stem of expectedStems) {
    expect(extracted).toContain(stem);
  }
});
|
||||
|
||||
it("filters Korean stop words including inflected forms", () => {
  // The query consists solely of tokens that should be treated as stop words.
  const extracted = extractKeywords("나는 그리고 그래서");

  // Neither the bare form ("나") nor the inflected form ("나는") may survive,
  // and the two connective words must be dropped as well.
  const rejected = ["나", "나는", "그리고", "그래서"];
  for (const word of rejected) {
    expect(extracted).not.toContain(word);
  }
});
|
||||
|
||||
it("filters inflected Korean stop words not explicitly listed", () => {
  // Both tokens are pronoun + particle combinations.
  const extracted = extractKeywords("그녀는 우리는");

  // The inflected surface forms are checked first, then the bare stems;
  // none of them may be reported as a keyword.
  const rejected = ["그녀는", "우리는", "그녀", "우리"];
  for (const word of rejected) {
    expect(extracted).not.toContain(word);
  }
});
|
||||
|
||||
it("does not produce bogus single-char stems from particle stripping", () => {
  // A two-character word whose last character looks like a particle.
  const wholeWord = "논의";
  const bogusStem = "논";

  const extracted = extractKeywords(wholeWord);

  // The word is kept intact and is not cut down to a single character.
  expect(extracted).toContain(wholeWord);
  expect(extracted).not.toContain(bogusStem);
});
|
||||
|
||||
it("strips longest Korean trailing particles first", () => {
  // "기능으로" ends in the two-character particle "으로"; removing only the
  // final "로" would leave the wrong stem "기능으".
  const extracted = extractKeywords("기능으로 설명");

  const fullyStripped = "기능";
  const partiallyStripped = "기능으";

  expect(extracted).toContain(fullyStripped);
  expect(extracted).not.toContain(partiallyStripped);
});
|
||||
|
||||
it("keeps stripped ASCII stems for mixed Korean tokens", () => {
  // "API를" is an ASCII word with a Korean particle attached.
  const extracted = extractKeywords("API를 배포했다");

  // The ASCII stem is expected in lowercase, alongside the untouched verb form.
  for (const term of ["api", "배포했다"]) {
    expect(extracted).toContain(term);
  }
});
|
||||
|
||||
it("handles mixed Korean and English query", () => {
  // A query mixing a standalone English token with Korean words.
  const extracted = extractKeywords("API 배포에 대한 논의");

  // The lowercased English token and the Korean terms must all be returned.
  const expectedTerms = ["api", "배포", "논의"];
  for (const term of expectedTerms) {
    expect(extracted).toContain(term);
  }
});
|
||||
|
||||
it("handles empty query", () => {
|
||||
expect(extractKeywords("")).toEqual([]);
|
||||
expect(extractKeywords(" ")).toEqual([]);
|
||||
|
||||
Reference in New Issue
Block a user