memory-neo4j: purge noise, tighten auto-capture filters, cap sleep cycle dedup

- Add 10 ASSISTANT_NARRATION_PATTERNS to reject play-by-play self-talk ("Let me check...", "I'll run...", "Starting...", "Good! The...", etc.)
- Cap Phase 1b semantic dedup to 50 pairs (sorted by similarity desc) to prevent sleep cycle timeouts on large memory sets
- Raise user auto-capture importance threshold from 0.3 to 0.5
- Raise assistant auto-capture importance threshold from 0.7 to 0.8
- Raise MIN_WORD_COUNT from 5 to 8 for user attention gate
- Neo4j cleanup: deleted 155 noise entries (394→242 memories), recategorized 2 misplaced entries, stripped Slack metadata from 1 entry

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-21 10:04:58 +00:00 · 2026-02-10 13:03:41 +08:00
parent 309c5b6029
commit e0e98c2c0d
5 changed files with 166 additions and 18 deletions
--- a/extensions/memory-neo4j/attention-gate.ts
+++ b/extensions/memory-neo4j/attention-gate.ts
@@ -29,6 +29,11 @@ const NOISE_PATTERNS = [
  // --- Session reset prompts (from /new and /reset commands) ---
  /^A new session was started via/i,

+  // --- Raw chat messages with channel metadata (autocaptured noise) ---
+  /\[slack message id:/i,
+  /\[message_id:/i,
+  /\[telegram message id:/i,
+
  // --- System infrastructure messages (never user-generated) ---
  // Heartbeat prompts
  /Read HEARTBEAT\.md if it exists/i,
@@ -51,7 +56,7 @@ const MAX_CAPTURE_CHARS = 2000;
 const MIN_CAPTURE_CHARS = 30;

 /** Minimum word count — short contextual phrases lack standalone meaning. */
-const MIN_WORD_COUNT = 5;
+const MIN_WORD_COUNT = 8;

 export function passesAttentionGate(text: string): boolean {
  const trimmed = text.trim();
@@ -100,6 +105,34 @@ const MAX_ASSISTANT_CAPTURE_CHARS = 1000;
 /** Minimum word count for assistant messages — higher than user. */
 const MIN_ASSISTANT_WORD_COUNT = 10;

+/**
+ * Patterns that reject assistant self-narration — play-by-play commentary that
+ * reads like thinking out loud rather than a conclusion or fact (the single biggest source of
+ * noise in auto-captured assistant memories). Every pattern is case-insensitive and anchored at the start of the text.
+ */
+const ASSISTANT_NARRATION_PATTERNS = [
+  // "Let me ..." / "Now let me ..." / "Ok, let me ..." followed by one of the listed action verbs
+  /^(ok[,.]?\s+)?(now\s+)?let me\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure)/i,
+  // "I'll ..." / "I will ..." followed by one of the listed action verbs
+  /^I('ll| will)\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure|execute|help|handle)/i,
+  // "Starting ..." / "Running ..." / "Processing ..." status updates (gerund followed by whitespace)
+  /^(starting|running|processing|checking|fetching|scanning|building|installing|downloading|configuring|executing|loading|updating)\s/i,
+  // "Good!" / "Great!" / "Perfect!" / "Done." as opener followed by "I", "the", "now", "let", "we" or "that"
+  /^(good|great|perfect|nice|excellent|awesome|done)[!.]?\s+(i |the |now |let |we |that )/i,
+  // Progress narration: "Now I have..." / "Now I can see..." / "Now we need..." / "Now the ..."
+  /^now\s+(i\s+(have|can|need|see|understand)|we\s+(have|can|need)|the\s)/i,
+  // Step narration: "Step 1:" / "**Step 1:**"
+  /^\*?\*?step\s+\d/i,
+  // Narration of what was found: "Found it ..." / "Found the ..." / "I see — ..." (em dash, en dash or hyphen)
+  /^(found it|found the|i see\s*[—–-])/i,
+  // Text opening with a weekday + YYYY-MM-DD stamp, optionally bracketed, e.g. "[Mon 2026-02-09" (sub-agent task descriptions)
+  /^\[?(mon|tue|wed|thu|fri|sat|sun)\s+\d{4}-\d{2}-\d{2}/i,
+  // Context compaction self-announcements ("🔄 **Context Reset** ...")
+  /^🔄\s*\*?\*?context reset/i,
+  // Filename slug generation prompts (internal tool use)
+  /^based on this conversation,?\s*generate a short/i,
+];
+
 export function passesAssistantAttentionGate(text: string): boolean {
  const trimmed = text.trim();

@@ -144,6 +177,11 @@ export function passesAssistantAttentionGate(text: string): boolean {
    return false;
  }

+  // Assistant-specific narration patterns (play-by-play self-talk)
+  if (ASSISTANT_NARRATION_PATTERNS.some((r) => r.test(trimmed))) {
+    return false;
+  }
+
  // Excessive emoji (likely reaction, not substance)
  const emojiCount = (
    trimmed.match(/[\u{1F300}-\u{1F9FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1FA00}-\u{1FAFF}]/gu) ||
--- a/extensions/memory-neo4j/extractor.test.ts
+++ b/extensions/memory-neo4j/extractor.test.ts
@@ -146,14 +146,26 @@ describe("passesAttentionGate", () => {
  });

  it("should accept messages with specific information/preferences", () => {
-    expect(passesAttentionGate("I prefer using TypeScript over JavaScript")).toBe(true);
-    expect(passesAttentionGate("My meeting with John is on Thursday")).toBe(true);
-    expect(passesAttentionGate("The project deadline was moved to March")).toBe(true);
+    expect(
+      passesAttentionGate("I strongly prefer using TypeScript over JavaScript for all projects"),
+    ).toBe(true);
+    expect(
+      passesAttentionGate("My important meeting with John is scheduled for Thursday afternoon"),
+    ).toBe(true);
+    expect(
+      passesAttentionGate("The project deadline was moved to March due to client feedback"),
+    ).toBe(true);
  });

  it("should accept actionable requests with context", () => {
-    expect(passesAttentionGate("Let's limit the wa-group-monitoring to business hours")).toBe(true);
-    expect(passesAttentionGate("Can you check the error logs on the production server")).toBe(true);
+    expect(
+      passesAttentionGate("Let's limit the wa-group-monitoring cron job to business hours only"),
+    ).toBe(true);
+    expect(
+      passesAttentionGate(
+        "Can you check the error logs on the production server for recent failures",
+      ),
+    ).toBe(true);
  });
 });

@@ -1334,6 +1346,84 @@ describe("passesAssistantAttentionGate", () => {
    expect(passesAssistantAttentionGate("ok")).toBe(false);
    expect(passesAssistantAttentionGate("sounds good")).toBe(false);
  });
+
+  it("should reject 'Let me...' action narration", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "Let me check the error logs on the production server for recent failures and report back.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Now let me update the dashboard and send the Slack report with today's results:",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Let me run the LinkedIn parallel outreach job and start by setting up the search term rotation.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject 'I'll...' action narration", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "I'll run the email labeler to classify any unread, unlabeled emails right now.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "I'll check for newly accepted LinkedIn connections and update the tracker spreadsheet.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject 'Starting/Running/Processing...' status updates", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "Starting LinkedIn outreach for Training category using profile linkedin-3 with isolated browser.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Processing through extraction steadily doing eight at a time against local Qwen model.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject 'Good!/Perfect!' opener narration", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "Good! I can see the search results. I've identified several 2nd-degree prospects to connect with.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Perfect! The connection dialog appeared. I'll click Add a note to add the personalized message.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject context compaction announcements", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "\u{1F504} **Context Reset** \u{2014} My memory was just compacted. Last thing I remember: setting up Flux 2.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should still accept substantive assistant conclusions", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "The memory-neo4j plugin uses confidence-weighted RRF for search result fusion and a 3-signal hybrid search combining HNSW, BM25, and graph traversal.",
+      ),
+    ).toBe(true);
+    expect(
+      passesAssistantAttentionGate(
+        "Whisper wins accuracy across all tests while SenseVoice wins speed at seventeen to thirty-four times faster processing.",
+      ),
+    ).toBe(true);
+  });
 });

 // ============================================================================
--- a/extensions/memory-neo4j/extractor.ts
+++ b/extensions/memory-neo4j/extractor.ts
@@ -806,6 +806,23 @@ export async function runSleepCycle(
          }
        }

+        // Cap the number of LLM-checked pairs to prevent sleep cycle timeouts.
+        // Sort by similarity descending so higher-similarity pairs (more likely
+        // to be duplicates) are checked first.
+        const MAX_SEMANTIC_DEDUP_PAIRS = 50;
+        if (allPairs.length > MAX_SEMANTIC_DEDUP_PAIRS) {
+          allPairs.sort((a, b) => (b.similarity ?? 0) - (a.similarity ?? 0));
+          const skipped = allPairs.length - MAX_SEMANTIC_DEDUP_PAIRS;
+          allPairs.length = MAX_SEMANTIC_DEDUP_PAIRS;
+          onProgress?.(
+            "semanticDedup",
+            `Capped at ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${skipped} lower-similarity pairs skipped)`,
+          );
+          logger.info(
+            `memory-neo4j: [sleep] Phase 1b capped to ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${skipped} skipped)`,
+          );
+        }
+
        // Process pairs in concurrent batches
        const invalidatedIds = new Set<string>();

--- a/extensions/memory-neo4j/index.test.ts
+++ b/extensions/memory-neo4j/index.test.ts
@@ -57,11 +57,10 @@ describe("passesAttentionGate", () => {
    });

    it("should accept messages at exactly 30 characters with sufficient words", () => {
-      // 30 chars, 5 words: "abcde abcde abcde abcde abcde" = 29 chars (5*5 + 4 spaces)
-      // Need 30+ chars and 5+ words
-      const text = "abcdef abcdef abcdef abcdef ab";
-      expect(text.length).toBe(30);
-      expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(5);
+      // Need 30+ chars and 8+ words
+      const text = "ab cd ef gh ij kl mn op qr st u";
+      expect(text.length).toBeGreaterThanOrEqual(30);
+      expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(8);
      expect(passesAttentionGate(text)).toBe(true);
    });

@@ -81,15 +80,19 @@ describe("passesAttentionGate", () => {
  // -----------------------------------------------------------------------

  describe("word count", () => {
-    it("should reject messages with fewer than 5 words", () => {
-      // 4 words, but long enough in chars (> 30)
+    it("should reject messages with fewer than 8 words", () => {
+      // 7 words, but long enough in chars (> 30)
      expect(
-        passesAttentionGate("thisislongword anotherlongword thirdlongword fourthlongword"),
+        passesAttentionGate(
+          "thisislongword anotherlongword thirdlongword fourthlongword fifth sixth seventh",
+        ),
      ).toBe(false);
    });

-    it("should accept messages with exactly 5 words", () => {
-      expect(passesAttentionGate("thisword thatword another fourth fifthword")).toBe(true);
+    it("should accept messages with exactly 8 words", () => {
+      expect(
+        passesAttentionGate("thisword thatword another fourth fifthword sixth seventh eighth"),
+      ).toBe(true);
    });
  });

--- a/extensions/memory-neo4j/index.ts
+++ b/extensions/memory-neo4j/index.ts
@@ -1285,7 +1285,7 @@ async function runAutoCapture(
        const result = await captureMessage(
          text,
          "auto-capture",
-          0.3,
+          0.5,
          1.0,
          agentId,
          sessionKey,
@@ -1312,7 +1312,7 @@ async function runAutoCapture(
        const result = await captureMessage(
          text,
          "auto-capture-assistant",
-          0.7,
+          0.8,
          0.75,
          agentId,
          sessionKey,