fix: prevent compaction "prompt too long" errors (#22921)

* includes prompt overhead in the compaction safeguard calculation.

Subtracts SUMMARIZATION_OVERHEAD_TOKENS from maxChunkTokens in both the main summarization path and the dropped-messages summarization path.

This ensures the chunk budget leaves room for the prompt overhead that generateSummary wraps around each chunk.

* adds a budget for overhead tokens and uses an effectiveMax instead of the raw maxTokens.

- Added `SUMMARIZATION_OVERHEAD_TOKENS = 4096` — a budget for the tokens that `generateSummary` adds on top of the serialized conversation (system prompt, `<conversation>` tags, summarization instructions, `<previous-summary>` block, and reasoning: "high" thinking budget).
- `chunkMessagesByMaxTokens` now divides `maxTokens` by `SAFETY_MARGIN` (1.2) before comparing against estimated token counts. Previously, the safety margin was only used in `computeAdaptiveChunkRatio` and `isOversizedForSummary` but not in the actual chunking loop — so chunks could be built that fit the estimated budget but exceeded the real budget once the API tokenized them properly.
This commit is contained in:
Val Alexander
2026-02-21 14:42:18 -06:00
committed by GitHub
parent ac633366ce
commit b703ea3675
2 changed files with 21 additions and 5 deletions

View File

@@ -68,6 +68,11 @@ export function splitMessagesByTokenShare(
return chunks;
}
// Token budget reserved for everything generateSummary layers around a chunk:
// the system prompt, the <conversation> wrapper tags, the summarization
// instructions, and any <previous-summary> block. generateSummary also runs
// with reasoning: "high", whose thinking output draws from the same context.
export const SUMMARIZATION_OVERHEAD_TOKENS = 4096;
export function chunkMessagesByMaxTokens(
messages: AgentMessage[],
maxTokens: number,
@@ -76,13 +81,17 @@ export function chunkMessagesByMaxTokens(
return [];
}
// Apply safety margin to compensate for estimateTokens() underestimation
// (chars/4 heuristic misses multi-byte chars, special tokens, code tokens, etc.)
const effectiveMax = Math.max(1, Math.floor(maxTokens / SAFETY_MARGIN));
const chunks: AgentMessage[][] = [];
let currentChunk: AgentMessage[] = [];
let currentTokens = 0;
for (const message of messages) {
const messageTokens = estimateTokens(message);
if (currentChunk.length > 0 && currentTokens + messageTokens > maxTokens) {
if (currentChunk.length > 0 && currentTokens + messageTokens > effectiveMax) {
chunks.push(currentChunk);
currentChunk = [];
currentTokens = 0;
@@ -91,7 +100,7 @@ export function chunkMessagesByMaxTokens(
currentChunk.push(message);
currentTokens += messageTokens;
if (messageTokens > maxTokens) {
if (messageTokens > effectiveMax) {
// Split oversized messages to avoid unbounded chunk growth.
chunks.push(currentChunk);
currentChunk = [];