perf(test): cover embedding chunk limits without indexing

2026-04-18 14:07:27 +00:00 · 2026-02-15 13:44:41 +00:00
parent e3f4cabf49
commit a4b958efcd
2 changed files with 52 additions and 60 deletions
--- a/src/memory/embedding-chunk-limits.test.ts
+++ b/src/memory/embedding-chunk-limits.test.ts
@@ -0,0 +1,52 @@
+import { describe, expect, it } from "vitest";
+import type { EmbeddingProvider } from "./embeddings.js";
+import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
+import { estimateUtf8Bytes } from "./embedding-input-limits.js";
+
+/**
+ * Test double for an embedding provider: only `maxInputTokens` varies between tests, and
+ * both embed methods resolve to fixed placeholder vectors.
+ */
+function createProvider(maxInputTokens: number): EmbeddingProvider {
+  const embedQuery = async () => [0];
+  const embedBatch = async () => [[0]];
+  return { id: "mock", model: "mock-embed", maxInputTokens, embedQuery, embedBatch };
+}
+
+describe("embedding chunk limits", () => {
+  // ASCII text is one UTF-8 byte per character, so 9000 chars exceeds the 8192 limit.
+  it("splits oversized chunks so each embedding input stays <= maxInputTokens bytes", () => {
+    const provider = createProvider(8192);
+    const input = { startLine: 1, endLine: 1, text: "x".repeat(9000), hash: "ignored" };
+
+    const out = enforceEmbeddingMaxInputTokens(provider, [input]);
+    expect(out.length).toBeGreaterThan(1);
+    // Splitting must be lossless: the pieces concatenate back to the original text.
+    expect(out.map((chunk) => chunk.text).join("")).toBe(input.text);
+    for (const chunk of out) {
+      expect(estimateUtf8Bytes(chunk.text)).toBeLessThanOrEqual(8192);
+      // Every piece keeps the source line range and carries a non-empty string hash.
+      expect([chunk.startLine, chunk.endLine]).toEqual([1, 1]);
+      expect(typeof chunk.hash).toBe("string");
+      expect(chunk.hash.length).toBeGreaterThan(0);
+    }
+  });
+
+  // Each emoji is a surrogate pair (2 UTF-16 units, 4 UTF-8 bytes): 2100 per line = 8400 bytes.
+  it("does not split inside surrogate pairs (emoji)", () => {
+    const provider = createProvider(8192);
+    const emoji = "😀";
+    const inputText = `${emoji.repeat(2100)}\n${emoji.repeat(2100)}`;
+
+    const out = enforceEmbeddingMaxInputTokens(provider, [
+      { startLine: 1, endLine: 2, text: inputText, hash: "ignored" },
+    ]);
+
+    expect(out.length).toBeGreaterThan(1);
+    expect(out.map((chunk) => chunk.text).join("")).toBe(inputText);
+    expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 8192)).toBe(true);
+    // Re-joining would mask a split pair, so look for a lone surrogate at each chunk's edges.
+    const loneSurrogateAtEdge = /^[\uDC00-\uDFFF]|[\uD800-\uDBFF]$/;
+    expect(out.some((chunk) => loneSurrogateAtEdge.test(chunk.text))).toBe(false);
+  });
+});
--- a/src/memory/manager.embedding-token-limit.test.ts
+++ b/src/memory/manager.embedding-token-limit.test.ts
@@ -1,60 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, expect, it } from "vitest";
-import { installEmbeddingManagerFixture } from "./embedding-manager.test-harness.js";
-
-const fx = installEmbeddingManagerFixture({
-  fixturePrefix: "openclaw-mem-token-",
-  largeTokens: 10_000,
-  smallTokens: 1000,
-  createCfg: ({ workspaceDir, indexPath, tokens }) => ({
-    agents: {
-      defaults: {
-        workspace: workspaceDir,
-        memorySearch: {
-          provider: "openai",
-          model: "mock-embed",
-          store: { path: indexPath, vector: { enabled: false } },
-          chunking: { tokens, overlap: 0 },
-          sync: { watch: false, onSessionStart: false, onSearch: false },
-          query: { minScore: 0 },
-        },
-      },
-      list: [{ id: "main", default: true }],
-    },
-  }),
-});
-const { embedBatch } = fx;
-
-describe("memory embedding token limits", () => {
-  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
-    const memoryDir = fx.getMemoryDir();
-    const managerLarge = fx.getManagerLarge();
-    const content = "x".repeat(9500);
-    await fs.writeFile(path.join(memoryDir, "2026-01-09.md"), content);
-    await managerLarge.sync({ reason: "test" });
-
-    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
-    expect(inputs.length).toBeGreaterThan(1);
-    expect(
-      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
-    ).toBeLessThanOrEqual(8192);
-  });
-
-  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
-    const memoryDir = fx.getMemoryDir();
-    const managerSmall = fx.getManagerSmall();
-    const line = "😀".repeat(1800);
-    const content = `${line}\n${line}\n${line}`;
-    await fs.writeFile(path.join(memoryDir, "2026-01-10.md"), content);
-    await managerSmall.sync({ reason: "test" });
-
-    const batchSizes = embedBatch.mock.calls.map(
-      (call) => (call[0] as string[] | undefined)?.length ?? 0,
-    );
-    expect(batchSizes.length).toBe(3);
-    expect(batchSizes.every((size) => size === 1)).toBe(true);
-    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
-    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
-  });
-});