fix(compaction): use full-session token count for post-compaction sanity check

Rebased on upstream main.

- Estimate full session tokens (including system prompt, bootstrap context, workspace files) before compaction instead of using result.tokensBefore, which only covers the summarizable history subset.
- Add a 10% margin to account for heuristic token counter estimation jitter.
- Prevent valid token estimates from being discarded in sessions with large system prompts or workspace files.
2026-05-18 02:01:03 +00:00 · 2026-03-04 16:24:49 +03:00
parent fc2b796f02
commit 424ca99305
1 changed file with 22 additions and 2 deletions
--- a/src/agents/pi-embedded-runner/compact.ts
+++ b/src/agents/pi-embedded-runner/compact.ts
@@ -897,6 +897,17 @@ export async function compactEmbeddedPiSessionDirect(
        // Measure compactedCount from the original pre-limiting transcript so compaction
        // lifecycle metrics represent total reduction through the compaction pipeline.
        const messageCountCompactionInput = messageCountOriginal;
+        // Estimate full session tokens BEFORE compaction (including system prompt,
+        // bootstrap context, workspace files, and all history). This is needed for
+        // a correct sanity check — result.tokensBefore only covers the summarizable
+        // history subset, not the full session.
+        let fullSessionTokensBefore = 0;
+        try {
+          fullSessionTokensBefore = limited.reduce((sum, msg) => sum + estimateTokens(msg), 0);
+        } catch {
+          // If token estimation throws on a malformed message, skip the sanity check
+          // instead of crashing compaction.
+        }
        const result = await compactWithSafetyTimeout(() =>
          session.compact(params.customInstructions),
        );
@@ -912,8 +923,17 @@ export async function compactEmbeddedPiSessionDirect(
          for (const message of session.messages) {
            tokensAfter += estimateTokens(message);
          }
-          // Sanity check: tokensAfter should be less than tokensBefore
-          if (tokensAfter > (observedTokenCount ?? result.tokensBefore)) {
+          // Sanity check: compare against the best full-session pre-compaction baseline.
+          // Prefer the provider-observed live count when available; otherwise use the
+          // heuristic full-session estimate with a 10% margin for counter jitter.
+          const sanityCheckBaseline = observedTokenCount ?? fullSessionTokensBefore;
+          if (
+            sanityCheckBaseline > 0 &&
+            tokensAfter >
+              (observedTokenCount !== undefined
+                ? sanityCheckBaseline
+                : sanityCheckBaseline * 1.1)
+          ) {
            tokensAfter = undefined; // Don't trust the estimate
          }
        } catch {