From e0e98c2c0d1c6285823de1aa88239f0aacd3758a Mon Sep 17 00:00:00 2001
From: Tarun Sukhani <tarun@abundent.com>
Date: Tue, 10 Feb 2026 13:03:41 +0800
Subject: [PATCH] memory-neo4j: purge noise, tighten auto-capture filters, cap
 sleep cycle dedup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add 10 ASSISTANT_NARRATION_PATTERNS to reject play-by-play self-talk
  ("Let me check...", "I'll run...", "Starting...", "Good! The...", etc.)
- Cap Phase 1b semantic dedup to 50 pairs (sorted by similarity desc)
  to prevent sleep cycle timeouts on large memory sets
- Raise user auto-capture importance threshold from 0.3 to 0.5
- Raise assistant auto-capture importance threshold from 0.7 to 0.8
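- Raise MIN_WORD_COUNT from 5 to 8 for user attention gate
- Add NOISE_PATTERNS rejecting raw chat messages that carry channel
  metadata ("[slack message id:", "[message_id:", "[telegram message id:")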
- Neo4j cleanup: deleted 155 noise entries (394→242 memories),
  recategorized 2 misplaced entries, stripped Slack metadata from 1

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 extensions/memory-neo4j/attention-gate.ts |  40 ++++++++-
 extensions/memory-neo4j/extractor.test.ts | 100 ++++++++++++++++++++--
 extensions/memory-neo4j/extractor.ts      |  17 ++++
 extensions/memory-neo4j/index.test.ts     |  23 ++---
 extensions/memory-neo4j/index.ts          |   4 +-
 5 files changed, 166 insertions(+), 18 deletions(-)

diff --git a/extensions/memory-neo4j/attention-gate.ts b/extensions/memory-neo4j/attention-gate.ts
index c97e8e17aff..3ac72190264 100644
--- a/extensions/memory-neo4j/attention-gate.ts
+++ b/extensions/memory-neo4j/attention-gate.ts
@@ -29,6 +29,11 @@ const NOISE_PATTERNS = [
   // --- Session reset prompts (from /new and /reset commands) ---
   /^A new session was started via/i,
 
+  // --- Raw chat messages with channel metadata (autocaptured noise) ---
+  /\[slack message id:/i,
+  /\[message_id:/i,
+  /\[telegram message id:/i,
+
   // --- System infrastructure messages (never user-generated) ---
   // Heartbeat prompts
   /Read HEARTBEAT\.md if it exists/i,
@@ -51,7 +56,7 @@ const MAX_CAPTURE_CHARS = 2000;
 const MIN_CAPTURE_CHARS = 30;
 
 /** Minimum word count — short contextual phrases lack standalone meaning. */
-const MIN_WORD_COUNT = 5;
+const MIN_WORD_COUNT = 8;
 
 export function passesAttentionGate(text: string): boolean {
   const trimmed = text.trim();
@@ -100,6 +105,34 @@ const MAX_ASSISTANT_CAPTURE_CHARS = 1000;
 /** Minimum word count for assistant messages — higher than user. */
 const MIN_ASSISTANT_WORD_COUNT = 10;
 
+/**
+ * Patterns that reject assistant self-narration — play-by-play commentary
+ * that reads like thinking out loud rather than a conclusion or fact.
+ * These are the single biggest source of noise in auto-captured assistant memories.
+ */
+const ASSISTANT_NARRATION_PATTERNS = [
+  // "Let me ..." / "Now let me ..." / "OK, let me ..." action narration
+  /^(ok[,.]?\s+)?(now\s+)?let me\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure)/i,
+  // "I'll ..." action narration
+  /^I('ll| will)\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure|execute|help|handle)/i,
+  // "Starting ..." / "Running ..." / "Processing ..." status updates
+  /^(starting|running|processing|checking|fetching|scanning|building|installing|downloading|configuring|executing|loading|updating)\s/i,
+  // "Good!" / "Great!" / "Perfect!" as opener followed by narration
+  /^(good|great|perfect|nice|excellent|awesome|done)[!.]?\s+(i |the |now |let |we |that )/i,
+  // Progress narration: "Now I have..." / "Now I can see..." / "Now we need..." / "Now the ..."
+  /^now\s+(i\s+(have|can|need|see|understand)|we\s+(have|can|need)|the\s)/i,
+  // Step narration: "Step 1:" / "**Step 1:**"
+  /^\*?\*?step\s+\d/i,
+  // Narration of what was found/seen: "Found it." / "Found the ..." / "I see — ..."
+  /^(found it|found the|i see\s*[—–-])/i,
+  // Timestamp-prefixed sub-agent task descriptions, e.g. "[Tue 2026-02-10 ..." (workflow narration)
+  /^\[?(mon|tue|wed|thu|fri|sat|sun)\s+\d{4}-\d{2}-\d{2}/i,
+  // Context compaction self-announcements
+  /^🔄\s*\*?\*?context reset/i,
+  // Filename slug generation prompts (internal tool use)
+  /^based on this conversation,?\s*generate a short/i,
+];
+
 export function passesAssistantAttentionGate(text: string): boolean {
   const trimmed = text.trim();
 
@@ -144,6 +177,11 @@ export function passesAssistantAttentionGate(text: string): boolean {
     return false;
   }
 
+  // Assistant-specific narration patterns (play-by-play self-talk)
+  if (ASSISTANT_NARRATION_PATTERNS.some((r) => r.test(trimmed))) {
+    return false;
+  }
+
   // Excessive emoji (likely reaction, not substance)
   const emojiCount = (
     trimmed.match(/[\u{1F300}-\u{1F9FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1FA00}-\u{1FAFF}]/gu) ||
diff --git a/extensions/memory-neo4j/extractor.test.ts b/extensions/memory-neo4j/extractor.test.ts
index 575f6622ae7..7bb41012826 100644
--- a/extensions/memory-neo4j/extractor.test.ts
+++ b/extensions/memory-neo4j/extractor.test.ts
@@ -146,14 +146,26 @@ describe("passesAttentionGate", () => {
   });
 
   it("should accept messages with specific information/preferences", () => {
-    expect(passesAttentionGate("I prefer using TypeScript over JavaScript")).toBe(true);
-    expect(passesAttentionGate("My meeting with John is on Thursday")).toBe(true);
-    expect(passesAttentionGate("The project deadline was moved to March")).toBe(true);
+    expect(
+      passesAttentionGate("I strongly prefer using TypeScript over JavaScript for all projects"),
+    ).toBe(true);
+    expect(
+      passesAttentionGate("My important meeting with John is scheduled for Thursday afternoon"),
+    ).toBe(true);
+    expect(
+      passesAttentionGate("The project deadline was moved to March due to client feedback"),
+    ).toBe(true);
   });
 
   it("should accept actionable requests with context", () => {
-    expect(passesAttentionGate("Let's limit the wa-group-monitoring to business hours")).toBe(true);
-    expect(passesAttentionGate("Can you check the error logs on the production server")).toBe(true);
+    expect(
+      passesAttentionGate("Let's limit the wa-group-monitoring cron job to business hours only"),
+    ).toBe(true);
+    expect(
+      passesAttentionGate(
+        "Can you check the error logs on the production server for recent failures",
+      ),
+    ).toBe(true);
   });
 });
 
@@ -1334,6 +1346,84 @@ describe("passesAssistantAttentionGate", () => {
     expect(passesAssistantAttentionGate("ok")).toBe(false);
     expect(passesAssistantAttentionGate("sounds good")).toBe(false);
   });
+
+  it("should reject 'Let me...' action narration", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "Let me check the error logs on the production server for recent failures and report back.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Now let me update the dashboard and send the Slack report with today's results:",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Let me run the LinkedIn parallel outreach job and start by setting up the search term rotation.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject 'I'll...' action narration", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "I'll run the email labeler to classify any unread, unlabeled emails right now.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "I'll check for newly accepted LinkedIn connections and update the tracker spreadsheet.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject 'Starting/Running/Processing...' status updates", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "Starting LinkedIn outreach for Training category using profile linkedin-3 with isolated browser.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Processing through extraction steadily doing eight at a time against local Qwen model.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject 'Good!/Perfect!' opener narration", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "Good! I can see the search results. I've identified several 2nd-degree prospects to connect with.",
+      ),
+    ).toBe(false);
+    expect(
+      passesAssistantAttentionGate(
+        "Perfect! The connection dialog appeared. I'll click Add a note to add the personalized message.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should reject context compaction announcements", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "\u{1F504} **Context Reset** \u{2014} My memory was just compacted. Last thing I remember: setting up Flux 2.",
+      ),
+    ).toBe(false);
+  });
+
+  it("should still accept substantive assistant conclusions", () => {
+    expect(
+      passesAssistantAttentionGate(
+        "The memory-neo4j plugin uses confidence-weighted RRF for search result fusion and a 3-signal hybrid search combining HNSW, BM25, and graph traversal.",
+      ),
+    ).toBe(true);
+    expect(
+      passesAssistantAttentionGate(
+        "Whisper wins accuracy across all tests while SenseVoice wins speed at seventeen to thirty-four times faster processing.",
+      ),
+    ).toBe(true);
+  });
 });
 
 // ============================================================================
diff --git a/extensions/memory-neo4j/extractor.ts b/extensions/memory-neo4j/extractor.ts
index 9b4dc077431..eee06ddca01 100644
--- a/extensions/memory-neo4j/extractor.ts
+++ b/extensions/memory-neo4j/extractor.ts
@@ -806,6 +806,23 @@ export async function runSleepCycle(
           }
         }
 
+        // Cap the number of LLM-checked pairs to prevent sleep cycle timeouts.
+        // Sort by similarity descending so higher-similarity pairs (more likely
+        // to be duplicates) are checked first.
+        const MAX_SEMANTIC_DEDUP_PAIRS = 50;
+        if (allPairs.length > MAX_SEMANTIC_DEDUP_PAIRS) {
+          allPairs.sort((a, b) => (b.similarity ?? 0) - (a.similarity ?? 0));
+          const skipped = allPairs.length - MAX_SEMANTIC_DEDUP_PAIRS;
+          allPairs.length = MAX_SEMANTIC_DEDUP_PAIRS;
+          onProgress?.(
+            "semanticDedup",
+            `Capped at ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${skipped} lower-similarity pairs skipped)`,
+          );
+          logger.info(
+            `memory-neo4j: [sleep] Phase 1b capped to ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${skipped} skipped)`,
+          );
+        }
+
         // Process pairs in concurrent batches
         const invalidatedIds = new Set<string>();
 
diff --git a/extensions/memory-neo4j/index.test.ts b/extensions/memory-neo4j/index.test.ts
index e6dd66149d4..b503100bf9e 100644
--- a/extensions/memory-neo4j/index.test.ts
+++ b/extensions/memory-neo4j/index.test.ts
@@ -57,11 +57,10 @@ describe("passesAttentionGate", () => {
     });
 
     it("should accept messages at exactly 30 characters with sufficient words", () => {
-      // 30 chars, 5 words: "abcde abcde abcde abcde abcde" = 29 chars (5*5 + 4 spaces)
-      // Need 30+ chars and 5+ words
-      const text = "abcdef abcdef abcdef abcdef ab";
-      expect(text.length).toBe(30);
-      expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(5);
+      // Exactly 30 chars (the MIN_CAPTURE_CHARS boundary) and 10 words (8+ required)
+      const text = "ab cd ef gh ij kl mn op qr stu";
+      expect(text.length).toBe(30);
+      expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(8);
       expect(passesAttentionGate(text)).toBe(true);
     });
 
@@ -81,15 +80,19 @@ describe("passesAttentionGate", () => {
   // -----------------------------------------------------------------------
 
   describe("word count", () => {
-    it("should reject messages with fewer than 5 words", () => {
-      // 4 words, but long enough in chars (> 30)
+    it("should reject messages with fewer than 8 words", () => {
+      // 7 words, but long enough in chars (> 30)
       expect(
-        passesAttentionGate("thisislongword anotherlongword thirdlongword fourthlongword"),
+        passesAttentionGate(
+          "thisislongword anotherlongword thirdlongword fourthlongword fifth sixth seventh",
+        ),
       ).toBe(false);
     });
 
-    it("should accept messages with exactly 5 words", () => {
-      expect(passesAttentionGate("thisword thatword another fourth fifthword")).toBe(true);
+    it("should accept messages with exactly 8 words", () => {
+      expect(
+        passesAttentionGate("thisword thatword another fourth fifthword sixth seventh eighth"),
+      ).toBe(true);
     });
   });
 
diff --git a/extensions/memory-neo4j/index.ts b/extensions/memory-neo4j/index.ts
index 252baa281ba..7f66bf7e2ce 100644
--- a/extensions/memory-neo4j/index.ts
+++ b/extensions/memory-neo4j/index.ts
@@ -1285,7 +1285,7 @@ async function runAutoCapture(
         const result = await captureMessage(
           text,
           "auto-capture",
-          0.3,
+          0.5,
           1.0,
           agentId,
           sessionKey,
@@ -1312,7 +1312,7 @@ async function runAutoCapture(
         const result = await captureMessage(
           text,
           "auto-capture-assistant",
-          0.7,
+          0.8,
           0.75,
           agentId,
           sessionKey,