perf(test): cover embedding chunk limits without indexing

This commit is contained in:
Peter Steinberger
2026-02-15 13:44:41 +00:00
parent e3f4cabf49
commit a4b958efcd
2 changed files with 52 additions and 60 deletions

View File

@@ -0,0 +1,52 @@
import { describe, expect, it } from "vitest";
import type { EmbeddingProvider } from "./embeddings.js";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
/**
 * Builds a minimal stub EmbeddingProvider for exercising chunk-limit logic.
 * The embedding callbacks return fixed vectors; only `maxInputTokens` varies,
 * letting each test pick the byte budget under test.
 *
 * @param maxInputTokens - input-size ceiling the stub provider advertises.
 */
function createProvider(maxInputTokens: number): EmbeddingProvider {
  // The tests never inspect embedding output, so fixed minimal vectors suffice.
  const embedQuery = async (): Promise<number[]> => [0];
  const embedBatch = async (): Promise<number[][]> => [[0]];
  return { id: "mock", model: "mock-embed", maxInputTokens, embedQuery, embedBatch };
}
describe("embedding chunk limits", () => {
  it("splits oversized chunks so each embedding input stays <= maxInputTokens bytes", () => {
    // 9000 ASCII bytes against an 8192-byte budget forces at least one split.
    const provider = createProvider(8192);
    const input = {
      startLine: 1,
      endLine: 1,
      text: "x".repeat(9000),
      hash: "ignored",
    };
    const pieces = enforceEmbeddingMaxInputTokens(provider, [input]);
    expect(pieces.length).toBeGreaterThan(1);
    // Concatenating the pieces must losslessly reproduce the original text.
    const rejoined = pieces.map((piece) => piece.text).join("");
    expect(rejoined).toBe(input.text);
    // Every piece individually respects the provider's byte budget.
    expect(pieces.every((piece) => estimateUtf8Bytes(piece.text) <= 8192)).toBe(true);
    // Line metadata carries over from the source chunk rather than being recomputed.
    expect(pieces.every((piece) => piece.startLine === 1 && piece.endLine === 1)).toBe(true);
    expect(pieces.every((piece) => typeof piece.hash === "string" && piece.hash.length > 0)).toBe(
      true,
    );
  });
  it("does not split inside surrogate pairs (emoji)", () => {
    const provider = createProvider(8192);
    const emoji = "😀";
    // Two emoji-heavy lines; the text comfortably exceeds the 8192-byte budget.
    const inputText = `${emoji.repeat(2100)}\n${emoji.repeat(2100)}`;
    const pieces = enforceEmbeddingMaxInputTokens(provider, [
      { startLine: 1, endLine: 2, text: inputText, hash: "ignored" },
    ]);
    expect(pieces.length).toBeGreaterThan(1);
    const rejoined = pieces.map((piece) => piece.text).join("");
    expect(rejoined).toBe(inputText);
    expect(pieces.every((piece) => estimateUtf8Bytes(piece.text) <= 8192)).toBe(true);
    // If we split inside surrogate pairs we'd likely end up with replacement chars.
    expect(rejoined).not.toContain("\uFFFD");
  });
});

View File

@@ -1,60 +0,0 @@
import fs from "node:fs/promises";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { installEmbeddingManagerFixture } from "./embedding-manager.test-harness.js";
// Shared fixture: installs an embedding-manager harness with two chunking
// budgets ("large" = 10_000 tokens, "small" = 1000 tokens). The prefix
// suggests a temp directory is created per run — see the harness for details.
const fx = installEmbeddingManagerFixture({
  fixturePrefix: "openclaw-mem-token-",
  largeTokens: 10_000,
  smallTokens: 1000,
  // Builds the agent config for a given workspace/index path and token budget.
  createCfg: ({ workspaceDir, indexPath, tokens }) => ({
    agents: {
      defaults: {
        workspace: workspaceDir,
        memorySearch: {
          provider: "openai",
          model: "mock-embed",
          // Vector search disabled: these tests only exercise chunking/batching.
          store: { path: indexPath, vector: { enabled: false } },
          chunking: { tokens, overlap: 0 },
          // All automatic sync triggers off; the tests call sync() explicitly.
          sync: { watch: false, onSessionStart: false, onSearch: false },
          query: { minScore: 0 },
        },
      },
      list: [{ id: "main", default: true }],
    },
  }),
});
// Mock of the provider's batch-embed call; tests inspect its recorded calls.
const { embedBatch } = fx;
describe("memory embedding token limits", () => {
  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
    const memoryDir = fx.getMemoryDir();
    const managerLarge = fx.getManagerLarge();
    // 9500 ASCII bytes exceed the 8192-byte cap asserted below, so syncing
    // this file must produce more than one embedding input.
    const content = "x".repeat(9500);
    await fs.writeFile(path.join(memoryDir, "2026-01-09.md"), content);
    await managerLarge.sync({ reason: "test" });
    // Flatten every batch the mock received into a single list of inputs.
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.length).toBeGreaterThan(1);
    // The largest single input must fit within the 8192-byte limit.
    expect(
      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
    ).toBeLessThanOrEqual(8192);
  });
  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
    const memoryDir = fx.getMemoryDir();
    const managerSmall = fx.getManagerSmall();
    // Each 😀 is 4 UTF-8 bytes, so one line is ~7200 bytes — under the limit
    // alone, but two lines together would exceed it if batched by char count.
    const line = "😀".repeat(1800);
    const content = `${line}\n${line}\n${line}`;
    await fs.writeFile(path.join(memoryDir, "2026-01-10.md"), content);
    await managerSmall.sync({ reason: "test" });
    const batchSizes = embedBatch.mock.calls.map(
      (call) => (call[0] as string[] | undefined)?.length ?? 0,
    );
    // Byte-aware batching should yield exactly one chunk per batch, three total.
    expect(batchSizes.length).toBe(3);
    expect(batchSizes.every((size) => size === 1)).toBe(true);
    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
  });
});