From e0e98c2c0d1c6285823de1aa88239f0aacd3758a Mon Sep 17 00:00:00 2001 From: Tarun Sukhani Date: Tue, 10 Feb 2026 13:03:41 +0800 Subject: [PATCH] memory-neo4j: purge noise, tighten auto-capture filters, cap sleep cycle dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add 10 ASSISTANT_NARRATION_PATTERNS to reject play-by-play self-talk ("Let me check...", "I'll run...", "Starting...", "Good! The...", etc.) - Cap Phase 1b semantic dedup to 50 pairs (sorted by similarity desc) to prevent sleep cycle timeouts on large memory sets - Raise user auto-capture importance threshold from 0.3 to 0.5 - Raise assistant auto-capture importance threshold from 0.7 to 0.8 - Raise MIN_WORD_COUNT from 5 to 8 for user attention gate - Neo4j cleanup: deleted 155 noise entries (394→242 memories), recategorized 2 misplaced entries, stripped Slack metadata from 1 Co-Authored-By: Claude Opus 4.6 --- extensions/memory-neo4j/attention-gate.ts | 40 ++++++++- extensions/memory-neo4j/extractor.test.ts | 100 ++++++++++++++++++++-- extensions/memory-neo4j/extractor.ts | 17 ++++ extensions/memory-neo4j/index.test.ts | 23 ++--- extensions/memory-neo4j/index.ts | 4 +- 5 files changed, 166 insertions(+), 18 deletions(-) diff --git a/extensions/memory-neo4j/attention-gate.ts b/extensions/memory-neo4j/attention-gate.ts index c97e8e17aff..3ac72190264 100644 --- a/extensions/memory-neo4j/attention-gate.ts +++ b/extensions/memory-neo4j/attention-gate.ts @@ -29,6 +29,11 @@ const NOISE_PATTERNS = [ // --- Session reset prompts (from /new and /reset commands) --- /^A new session was started via/i, + // --- Raw chat messages with channel metadata (autocaptured noise) --- + /\[slack message id:/i, + /\[message_id:/i, + /\[telegram message id:/i, + // --- System infrastructure messages (never user-generated) --- // Heartbeat prompts /Read HEARTBEAT\.md if it exists/i, @@ -51,7 +56,7 @@ const MAX_CAPTURE_CHARS = 2000; const MIN_CAPTURE_CHARS = 30; /** Minimum word 
count — short contextual phrases lack standalone meaning. */ -const MIN_WORD_COUNT = 5; +const MIN_WORD_COUNT = 8; export function passesAttentionGate(text: string): boolean { const trimmed = text.trim(); @@ -100,6 +105,34 @@ const MAX_ASSISTANT_CAPTURE_CHARS = 1000; /** Minimum word count for assistant messages — higher than user. */ const MIN_ASSISTANT_WORD_COUNT = 10; +/** + * Patterns that reject assistant self-narration — play-by-play commentary + * that reads like thinking out loud rather than a conclusion or fact. + * These are the single biggest source of noise in auto-captured assistant memories. + */ +const ASSISTANT_NARRATION_PATTERNS = [ + // "Let me ..." / "Now let me ..." / "I'll ..." action narration + /^(ok[,.]?\s+)?(now\s+)?let me\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure)/i, + // "I'll ..." action narration + /^I('ll| will)\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure|execute|help|handle)/i, + // "Starting ..." / "Running ..." / "Processing ..." status updates + /^(starting|running|processing|checking|fetching|scanning|building|installing|downloading|configuring|executing|loading|updating)\s/i, + // "Good!" / "Great!" / "Perfect!" as opener followed by narration + /^(good|great|perfect|nice|excellent|awesome|done)[!.]?\s+(i |the |now |let |we |that )/i, + // Progress narration: "Now I have..." / "Now I can see..." / "Now let me..." + /^now\s+(i\s+(have|can|need|see|understand)|we\s+(have|can|need)|the\s)/i, + // Step narration: "Step 1:" / "**Step 1:**" + /^\*?\*?step\s+\d/i, + // Narration of what was found/done: "Found it." / "Found X." / "I see — ..." 
+ /^(found it|found the|i see\s*[—–-])/i, + // Sub-agent task descriptions (workflow narration) + /^\[?(mon|tue|wed|thu|fri|sat|sun)\s+\d{4}-\d{2}-\d{2}/i, + // Context compaction self-announcements + /^🔄\s*\*?\*?context reset/i, + // Filename slug generation prompts (internal tool use) + /^based on this conversation,?\s*generate a short/i, +]; + export function passesAssistantAttentionGate(text: string): boolean { const trimmed = text.trim(); @@ -144,6 +177,11 @@ export function passesAssistantAttentionGate(text: string): boolean { return false; } + // Assistant-specific narration patterns (play-by-play self-talk) + if (ASSISTANT_NARRATION_PATTERNS.some((r) => r.test(trimmed))) { + return false; + } + // Excessive emoji (likely reaction, not substance) const emojiCount = ( trimmed.match(/[\u{1F300}-\u{1F9FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1FA00}-\u{1FAFF}]/gu) || diff --git a/extensions/memory-neo4j/extractor.test.ts b/extensions/memory-neo4j/extractor.test.ts index 575f6622ae7..7bb41012826 100644 --- a/extensions/memory-neo4j/extractor.test.ts +++ b/extensions/memory-neo4j/extractor.test.ts @@ -146,14 +146,26 @@ describe("passesAttentionGate", () => { }); it("should accept messages with specific information/preferences", () => { - expect(passesAttentionGate("I prefer using TypeScript over JavaScript")).toBe(true); - expect(passesAttentionGate("My meeting with John is on Thursday")).toBe(true); - expect(passesAttentionGate("The project deadline was moved to March")).toBe(true); + expect( + passesAttentionGate("I strongly prefer using TypeScript over JavaScript for all projects"), + ).toBe(true); + expect( + passesAttentionGate("My important meeting with John is scheduled for Thursday afternoon"), + ).toBe(true); + expect( + passesAttentionGate("The project deadline was moved to March due to client feedback"), + ).toBe(true); }); it("should accept actionable requests with context", () => { - expect(passesAttentionGate("Let's limit the wa-group-monitoring to 
business hours")).toBe(true); - expect(passesAttentionGate("Can you check the error logs on the production server")).toBe(true); + expect( + passesAttentionGate("Let's limit the wa-group-monitoring cron job to business hours only"), + ).toBe(true); + expect( + passesAttentionGate( + "Can you check the error logs on the production server for recent failures", + ), + ).toBe(true); }); }); @@ -1334,6 +1346,84 @@ describe("passesAssistantAttentionGate", () => { expect(passesAssistantAttentionGate("ok")).toBe(false); expect(passesAssistantAttentionGate("sounds good")).toBe(false); }); + + it("should reject 'Let me...' action narration", () => { + expect( + passesAssistantAttentionGate( + "Let me check the error logs on the production server for recent failures and report back.", + ), + ).toBe(false); + expect( + passesAssistantAttentionGate( + "Now let me update the dashboard and send the Slack report with today's results:", + ), + ).toBe(false); + expect( + passesAssistantAttentionGate( + "Let me run the LinkedIn parallel outreach job and start by setting up the search term rotation.", + ), + ).toBe(false); + }); + + it("should reject 'I'll...' action narration", () => { + expect( + passesAssistantAttentionGate( + "I'll run the email labeler to classify any unread, unlabeled emails right now.", + ), + ).toBe(false); + expect( + passesAssistantAttentionGate( + "I'll check for newly accepted LinkedIn connections and update the tracker spreadsheet.", + ), + ).toBe(false); + }); + + it("should reject 'Starting/Running/Processing...' status updates", () => { + expect( + passesAssistantAttentionGate( + "Starting LinkedIn outreach for Training category using profile linkedin-3 with isolated browser.", + ), + ).toBe(false); + expect( + passesAssistantAttentionGate( + "Processing through extraction steadily doing eight at a time against local Qwen model.", + ), + ).toBe(false); + }); + + it("should reject 'Good!/Perfect!' 
opener narration", () => { + expect( + passesAssistantAttentionGate( + "Good! I can see the search results. I've identified several 2nd-degree prospects to connect with.", + ), + ).toBe(false); + expect( + passesAssistantAttentionGate( + "Perfect! The connection dialog appeared. I'll click Add a note to add the personalized message.", + ), + ).toBe(false); + }); + + it("should reject context compaction announcements", () => { + expect( + passesAssistantAttentionGate( + "\u{1F504} **Context Reset** \u{2014} My memory was just compacted. Last thing I remember: setting up Flux 2.", + ), + ).toBe(false); + }); + + it("should still accept substantive assistant conclusions", () => { + expect( + passesAssistantAttentionGate( + "The memory-neo4j plugin uses confidence-weighted RRF for search result fusion and a 3-signal hybrid search combining HNSW, BM25, and graph traversal.", + ), + ).toBe(true); + expect( + passesAssistantAttentionGate( + "Whisper wins accuracy across all tests while SenseVoice wins speed at seventeen to thirty-four times faster processing.", + ), + ).toBe(true); + }); }); // ============================================================================ diff --git a/extensions/memory-neo4j/extractor.ts b/extensions/memory-neo4j/extractor.ts index 9b4dc077431..eee06ddca01 100644 --- a/extensions/memory-neo4j/extractor.ts +++ b/extensions/memory-neo4j/extractor.ts @@ -806,6 +806,23 @@ export async function runSleepCycle( } } + // Cap the number of LLM-checked pairs to prevent sleep cycle timeouts. + // Sort by similarity descending so higher-similarity pairs (more likely + // to be duplicates) are checked first. + const MAX_SEMANTIC_DEDUP_PAIRS = 50; + if (allPairs.length > MAX_SEMANTIC_DEDUP_PAIRS) { + allPairs.sort((a, b) => (b.similarity ?? 0) - (a.similarity ?? 
0)); + const skipped = allPairs.length - MAX_SEMANTIC_DEDUP_PAIRS; + allPairs.length = MAX_SEMANTIC_DEDUP_PAIRS; + onProgress?.( + "semanticDedup", + `Capped at ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${skipped} lower-similarity pairs skipped)`, + ); + logger.info( + `memory-neo4j: [sleep] Phase 1b capped to ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${skipped} skipped)`, + ); + } + // Process pairs in concurrent batches const invalidatedIds = new Set(); diff --git a/extensions/memory-neo4j/index.test.ts b/extensions/memory-neo4j/index.test.ts index e6dd66149d4..b503100bf9e 100644 --- a/extensions/memory-neo4j/index.test.ts +++ b/extensions/memory-neo4j/index.test.ts @@ -57,11 +57,10 @@ describe("passesAttentionGate", () => { }); it("should accept messages at exactly 30 characters with sufficient words", () => { - // 30 chars, 5 words: "abcde abcde abcde abcde abcde" = 29 chars (5*5 + 4 spaces) - // Need 30+ chars and 5+ words - const text = "abcdef abcdef abcdef abcdef ab"; - expect(text.length).toBe(30); - expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(5); + // Need 30+ chars and 8+ words + const text = "ab cd ef gh ij kl mn op qr st u"; + expect(text.length).toBeGreaterThanOrEqual(30); + expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(8); expect(passesAttentionGate(text)).toBe(true); }); @@ -81,15 +80,19 @@ describe("passesAttentionGate", () => { // ----------------------------------------------------------------------- describe("word count", () => { - it("should reject messages with fewer than 5 words", () => { - // 4 words, but long enough in chars (> 30) + it("should reject messages with fewer than 8 words", () => { + // 7 words, but long enough in chars (> 30) expect( - passesAttentionGate("thisislongword anotherlongword thirdlongword fourthlongword"), + passesAttentionGate( + "thisislongword anotherlongword thirdlongword fourthlongword fifth sixth seventh", + ), ).toBe(false); }); - it("should accept messages with exactly 5 words", () => { - 
expect(passesAttentionGate("thisword thatword another fourth fifthword")).toBe(true); + it("should accept messages with exactly 8 words", () => { + expect( + passesAttentionGate("thisword thatword another fourth fifthword sixth seventh eighth"), + ).toBe(true); }); }); diff --git a/extensions/memory-neo4j/index.ts b/extensions/memory-neo4j/index.ts index 252baa281ba..7f66bf7e2ce 100644 --- a/extensions/memory-neo4j/index.ts +++ b/extensions/memory-neo4j/index.ts @@ -1285,7 +1285,7 @@ async function runAutoCapture( const result = await captureMessage( text, "auto-capture", - 0.3, + 0.5, 1.0, agentId, sessionKey, @@ -1312,7 +1312,7 @@ async function runAutoCapture( const result = await captureMessage( text, "auto-capture-assistant", - 0.7, + 0.8, 0.75, agentId, sessionKey,