mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-18 14:07:27 +00:00
perf(test): cover embedding chunk limits without indexing
This commit is contained in:
52
src/memory/embedding-chunk-limits.test.ts
Normal file
52
src/memory/embedding-chunk-limits.test.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { EmbeddingProvider } from "./embeddings.js";
|
||||
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
|
||||
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
|
||||
|
||||
/**
 * Builds a minimal mock `EmbeddingProvider` for the chunk-limit tests.
 *
 * The embedding callbacks are inert stubs that always resolve to fixed values
 * (`[0]` for a query, `[[0]]` for a batch); the only thing that varies between
 * providers is the advertised `maxInputTokens`.
 *
 * @param maxInputTokens - Value exposed on the mock's `maxInputTokens` field.
 * @returns A provider object with id `"mock"` and model `"mock-embed"`.
 */
function createProvider(maxInputTokens: number): EmbeddingProvider {
  return {
    id: "mock",
    model: "mock-embed",
    maxInputTokens,
    embedQuery: async () => [0],
    embedBatch: async () => [[0]],
  };
}
|
||||
|
||||
/**
 * Unit tests for `enforceEmbeddingMaxInputTokens`, driven directly with a mock
 * provider and in-memory chunk objects.
 */
describe("embedding chunk limits", () => {
  it("splits oversized chunks so each embedding input stays <= maxInputTokens bytes", () => {
    const maxInputTokens = 8192;
    const provider = createProvider(maxInputTokens);
    const input = {
      startLine: 1,
      endLine: 1,
      // 9000 ASCII characters: deliberately larger than the 8192 limit.
      text: "x".repeat(9000),
      hash: "ignored",
    };

    const out = enforceEmbeddingMaxInputTokens(provider, [input]);

    expect(out.length).toBeGreaterThan(1);
    // Splitting must be lossless: the pieces concatenate back to the input.
    expect(out.map((chunk) => chunk.text).join("")).toBe(input.text);
    expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= maxInputTokens)).toBe(true);
    // Every piece keeps the line range of the chunk it was split from.
    expect(out.every((chunk) => chunk.startLine === 1 && chunk.endLine === 1)).toBe(true);
    expect(out.every((chunk) => typeof chunk.hash === "string" && chunk.hash.length > 0)).toBe(
      true,
    );
  });

  it("does not split inside surrogate pairs (emoji)", () => {
    const maxInputTokens = 8192;
    const provider = createProvider(maxInputTokens);
    // U+1F600 is two UTF-16 code units (a surrogate pair) and four UTF-8 bytes,
    // so each 2100-emoji line is 8400 bytes and must be split.
    const emoji = "😀";
    const inputText = `${emoji.repeat(2100)}\n${emoji.repeat(2100)}`;

    const out = enforceEmbeddingMaxInputTokens(provider, [
      { startLine: 1, endLine: 2, text: inputText, hash: "ignored" },
    ]);

    expect(out.length).toBeGreaterThan(1);
    const joined = out.map((chunk) => chunk.text).join("");
    expect(joined).toBe(inputText);
    expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= maxInputTokens)).toBe(true);

    // Guard against a splitter that substitutes replacement characters.
    expect(joined).not.toContain("\uFFFD");

    // Concatenation re-pairs surrogate halves that were separated, so the joined
    // text alone cannot reveal a bad split. The input contains only complete
    // pairs, so a chunk that starts with a low surrogate or ends with a high
    // surrogate was necessarily cut in the middle of an emoji.
    for (const chunk of out) {
      expect(/^[\uDC00-\uDFFF]/.test(chunk.text)).toBe(false);
      expect(/[\uD800-\uDBFF]$/.test(chunk.text)).toBe(false);
    }
  });
});
|
||||
@@ -1,60 +0,0 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { installEmbeddingManagerFixture } from "./embedding-manager.test-harness.js";
|
||||
|
||||
// Shared fixture for this file, built by the embedding-manager test harness.
// NOTE(review): `largeTokens` / `smallTokens` presumably become the `tokens`
// value handed to `createCfg` for the "large" and "small" managers that the
// tests fetch via `fx.getManagerLarge()` / `fx.getManagerSmall()` — confirm
// against the harness.
const fx = installEmbeddingManagerFixture({
  fixturePrefix: "openclaw-mem-token-",
  largeTokens: 10_000,
  smallTokens: 1000,
  createCfg: ({ workspaceDir, indexPath, tokens }) => ({
    agents: {
      defaults: {
        workspace: workspaceDir,
        memorySearch: {
          provider: "openai",
          model: "mock-embed",
          // Vector storage is switched off; the index lives at `indexPath`.
          store: { path: indexPath, vector: { enabled: false } },
          chunking: { tokens, overlap: 0 },
          // All automatic sync triggers are disabled; tests call `sync()` themselves.
          sync: { watch: false, onSessionStart: false, onSearch: false },
          query: { minScore: 0 },
        },
      },
      list: [{ id: "main", default: true }],
    },
  }),
});
// Mock whose recorded calls (`embedBatch.mock.calls`) the tests inspect.
const { embedBatch } = fx;
|
||||
|
||||
/**
 * Integration-style tests: write a memory file, run a manager sync, then
 * inspect the inputs recorded by the `embedBatch` mock.
 */
describe("memory embedding token limits", () => {
  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
    const memoryDir = fx.getMemoryDir();
    const managerLarge = fx.getManagerLarge();
    // 9500 ASCII characters: larger than the 8192-byte bound asserted below.
    const content = "x".repeat(9500);
    await fs.writeFile(path.join(memoryDir, "2026-01-09.md"), content);
    await managerLarge.sync({ reason: "test" });

    // Flatten the first argument of every recorded call into one list of inputs.
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.length).toBeGreaterThan(1);
    expect(
      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
    ).toBeLessThanOrEqual(8192);
  });

  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
    const memoryDir = fx.getMemoryDir();
    const managerSmall = fx.getManagerSmall();
    // Each line is 1800 emoji: 3600 UTF-16 code units but 7200 UTF-8 bytes.
    const line = "😀".repeat(1800);
    const content = `${line}\n${line}\n${line}`;
    await fs.writeFile(path.join(memoryDir, "2026-01-10.md"), content);
    await managerSmall.sync({ reason: "test" });

    const batchSizes = embedBatch.mock.calls.map(
      (call) => (call[0] as string[] | undefined)?.length ?? 0,
    );
    // Expect three separate calls carrying exactly one input each.
    expect(batchSizes.length).toBe(3);
    expect(batchSizes.every((size) => size === 1)).toBe(true);
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
  });
});
|
||||
Reference in New Issue
Block a user