perf: harden chunking against quadratic scans

This commit is contained in:
Peter Steinberger
2026-03-07 16:50:15 +00:00
parent b393b9e8ff
commit 74912037dc
7 changed files with 324 additions and 139 deletions

View File

@@ -1,4 +1,5 @@
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import * as fences from "../markdown/fences.js";
import { hasBalancedFences } from "../test-utils/chunk-test-helpers.js";
import {
chunkByNewline,
@@ -217,6 +218,17 @@ describe("chunkMarkdownText", () => {
expect(chunks[0]?.length).toBe(20);
expect(chunks.join("")).toBe(text);
});
it("parses fence spans once for long fenced payloads", () => {
  // Spy on the fences module export so every parseFenceSpans call made
  // while chunking this input is counted.
  const parseSpy = vi.spyOn(fences, "parseFenceSpans");
  try {
    // One fenced block much longer than the 80-char limit, so the chunker
    // has to split inside the fence many times.
    const text = `\`\`\`txt\n${"line\n".repeat(600)}\`\`\``;
    const chunks = chunkMarkdownText(text, 80);
    expect(chunks.length).toBeGreaterThan(2);
    // Regression guard: fence spans are expected to be parsed a single time
    // for the whole input rather than once per emitted chunk.
    expect(parseSpy).toHaveBeenCalledTimes(1);
  } finally {
    // Restore even when an assertion above throws; otherwise the spy stays
    // installed on the shared module and can skew later tests.
    parseSpy.mockRestore();
  }
});
});
describe("chunkByNewline", () => {

View File

@@ -306,7 +306,7 @@ export function chunkText(text: string, limit: number): string[] {
}
return chunkTextByBreakResolver(text, limit, (window) => {
// 1) Prefer a newline break inside the window (outside parentheses).
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window);
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window, 0, window.length);
// 2) Otherwise prefer the last whitespace (word boundary) inside the window.
return lastNewline > 0 ? lastNewline : lastWhitespace;
});
@@ -319,14 +319,24 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
}
const chunks: string[] = [];
let remaining = text;
const spans = parseFenceSpans(text);
let start = 0;
let reopenFence: ReturnType<typeof findFenceSpanAt> | undefined;
while (remaining.length > limit) {
const spans = parseFenceSpans(remaining);
const window = remaining.slice(0, limit);
while (start < text.length) {
const reopenPrefix = reopenFence ? `${reopenFence.openLine}\n` : "";
const contentLimit = Math.max(1, limit - reopenPrefix.length);
if (text.length - start <= contentLimit) {
const finalChunk = `${reopenPrefix}${text.slice(start)}`;
if (finalChunk.length > 0) {
chunks.push(finalChunk);
}
break;
}
const softBreak = pickSafeBreakIndex(window, spans);
let breakIdx = softBreak > 0 ? softBreak : limit;
const windowEnd = Math.min(text.length, start + contentLimit);
const softBreak = pickSafeBreakIndex(text, start, windowEnd, spans);
let breakIdx = softBreak > start ? softBreak : windowEnd;
const initialFence = isSafeFenceBreak(spans, breakIdx)
? undefined
@@ -335,38 +345,38 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
let fenceToSplit = initialFence;
if (initialFence) {
const closeLine = `${initialFence.indent}${initialFence.marker}`;
const maxIdxIfNeedNewline = limit - (closeLine.length + 1);
const maxIdxIfNeedNewline = start + (contentLimit - (closeLine.length + 1));
if (maxIdxIfNeedNewline <= 0) {
if (maxIdxIfNeedNewline <= start) {
fenceToSplit = undefined;
breakIdx = limit;
breakIdx = windowEnd;
} else {
const minProgressIdx = Math.min(
remaining.length,
initialFence.start + initialFence.openLine.length + 2,
text.length,
Math.max(start + 1, initialFence.start + initialFence.openLine.length + 2),
);
const maxIdxIfAlreadyNewline = limit - closeLine.length;
const maxIdxIfAlreadyNewline = start + (contentLimit - closeLine.length);
let pickedNewline = false;
let lastNewline = remaining.lastIndexOf("\n", Math.max(0, maxIdxIfAlreadyNewline - 1));
while (lastNewline !== -1) {
let lastNewline = text.lastIndexOf("\n", Math.max(start, maxIdxIfAlreadyNewline - 1));
while (lastNewline >= start) {
const candidateBreak = lastNewline + 1;
if (candidateBreak < minProgressIdx) {
break;
}
const candidateFence = findFenceSpanAt(spans, candidateBreak);
if (candidateFence && candidateFence.start === initialFence.start) {
breakIdx = Math.max(1, candidateBreak);
breakIdx = candidateBreak;
pickedNewline = true;
break;
}
lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
lastNewline = text.lastIndexOf("\n", lastNewline - 1);
}
if (!pickedNewline) {
if (minProgressIdx > maxIdxIfAlreadyNewline) {
fenceToSplit = undefined;
breakIdx = limit;
breakIdx = windowEnd;
} else {
breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
}
@@ -378,68 +388,72 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
fenceAtBreak && fenceAtBreak.start === initialFence.start ? fenceAtBreak : undefined;
}
let rawChunk = remaining.slice(0, breakIdx);
if (!rawChunk) {
const rawContent = text.slice(start, breakIdx);
if (!rawContent) {
break;
}
const brokeOnSeparator = breakIdx < remaining.length && /\s/.test(remaining[breakIdx]);
const nextStart = Math.min(remaining.length, breakIdx + (brokeOnSeparator ? 1 : 0));
let next = remaining.slice(nextStart);
let rawChunk = `${reopenPrefix}${rawContent}`;
const brokeOnSeparator = breakIdx < text.length && /\s/.test(text[breakIdx]);
let nextStart = Math.min(text.length, breakIdx + (brokeOnSeparator ? 1 : 0));
if (fenceToSplit) {
const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
rawChunk = rawChunk.endsWith("\n") ? `${rawChunk}${closeLine}` : `${rawChunk}\n${closeLine}`;
next = `${fenceToSplit.openLine}\n${next}`;
reopenFence = fenceToSplit;
} else {
next = stripLeadingNewlines(next);
nextStart = skipLeadingNewlines(text, nextStart);
reopenFence = undefined;
}
chunks.push(rawChunk);
remaining = next;
}
if (remaining.length) {
chunks.push(remaining);
start = nextStart;
}
return chunks;
}
function stripLeadingNewlines(value: string): string {
let i = 0;
function skipLeadingNewlines(value: string, start = 0): number {
let i = start;
while (i < value.length && value[i] === "\n") {
i++;
}
return i > 0 ? value.slice(i) : value;
return i;
}
function pickSafeBreakIndex(window: string, spans: ReturnType<typeof parseFenceSpans>): number {
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window, (index) =>
function pickSafeBreakIndex(
text: string,
start: number,
end: number,
spans: ReturnType<typeof parseFenceSpans>,
): number {
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(text, start, end, (index) =>
isSafeFenceBreak(spans, index),
);
if (lastNewline > 0) {
if (lastNewline > start) {
return lastNewline;
}
if (lastWhitespace > 0) {
if (lastWhitespace > start) {
return lastWhitespace;
}
return -1;
}
function scanParenAwareBreakpoints(
window: string,
text: string,
start: number,
end: number,
isAllowed: (index: number) => boolean = () => true,
): { lastNewline: number; lastWhitespace: number } {
let lastNewline = -1;
let lastWhitespace = -1;
let depth = 0;
for (let i = 0; i < window.length; i++) {
for (let i = start; i < end; i++) {
if (!isAllowed(i)) {
continue;
}
const char = window[i];
const char = text[i];
if (char === "(") {
depth += 1;
continue;