memory-neo4j: purge noise, tighten auto-capture filters, cap sleep cycle dedup

- Add 10 ASSISTANT_NARRATION_PATTERNS to reject play-by-play self-talk
  ("Let me check...", "I'll run...", "Starting...", "Good! The...", etc.)
- Cap Phase 1b semantic dedup to 50 pairs (sorted by similarity desc)
  to prevent sleep cycle timeouts on large memory sets
- Raise user auto-capture importance threshold from 0.3 to 0.5
- Raise assistant auto-capture importance threshold from 0.7 to 0.8
- Raise MIN_WORD_COUNT from 5 to 8 for user attention gate
- Neo4j cleanup: deleted 155 noise entries (394→242 memories),
  recategorized 2 misplaced entries, stripped Slack metadata from 1

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Tarun Sukhani
2026-02-10 13:03:41 +08:00
parent 309c5b6029
commit e0e98c2c0d
5 changed files with 166 additions and 18 deletions

View File

@@ -29,6 +29,11 @@ const NOISE_PATTERNS = [
// --- Session reset prompts (from /new and /reset commands) ---
/^A new session was started via/i,
// --- Raw chat messages with channel metadata (autocaptured noise) ---
/\[slack message id:/i,
/\[message_id:/i,
/\[telegram message id:/i,
// --- System infrastructure messages (never user-generated) ---
// Heartbeat prompts
/Read HEARTBEAT\.md if it exists/i,
@@ -51,7 +56,7 @@ const MAX_CAPTURE_CHARS = 2000;
const MIN_CAPTURE_CHARS = 30;
/** Minimum word count — short contextual phrases lack standalone meaning. */
const MIN_WORD_COUNT = 5;
const MIN_WORD_COUNT = 8;
export function passesAttentionGate(text: string): boolean {
const trimmed = text.trim();
@@ -100,6 +105,34 @@ const MAX_ASSISTANT_CAPTURE_CHARS = 1000;
/** Minimum word count for assistant messages — higher than user. */
const MIN_ASSISTANT_WORD_COUNT = 10;
/**
 * Patterns that reject assistant self-narration — play-by-play commentary
 * that reads like thinking out loud rather than a conclusion or fact.
 * These are the single biggest source of noise in auto-captured assistant memories.
 *
 * Every pattern is anchored to the start of the text, so it must be tested
 * against the trimmed message. The verb lists of the "let me" / "I'll"
 * patterns end in `\b` so that only whole verbs count: "I'll read the file"
 * is narration, while "I'll readily concede that ..." is not.
 */
const ASSISTANT_NARRATION_PATTERNS = [
  // "Let me ..." / "Now let me ..." / "Ok, let me ..." action narration
  /^(ok[,.]?\s+)?(now\s+)?let me\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure)\b/i,
  // "I'll ..." / "I will ..." action narration
  /^I('ll| will)\s+(check|look|see|try|run|start|test|read|update|verify|fix|search|process|create|build|set up|examine|investigate|query|fetch|pull|scan|clean|install|download|configure|execute|help|handle)\b/i,
  // "Starting ..." / "Running ..." / "Processing ..." status updates
  /^(starting|running|processing|checking|fetching|scanning|building|installing|downloading|configuring|executing|loading|updating)\s/i,
  // "Good!" / "Great!" / "Perfect!" as opener followed by narration
  /^(good|great|perfect|nice|excellent|awesome|done)[!.]?\s+(i |the |now |let |we |that )/i,
  // Progress narration: "Now I have..." / "Now I can see..." / "Now the ..."
  /^now\s+(i\s+(have|can|need|see|understand)|we\s+(have|can|need)|the\s)/i,
  // Step narration: "Step 1:" / "**Step 1:**"
  /^\*?\*?step\s+\d/i,
  // Narration of what was found/done: "Found it." / "Found the X." / "I see — ..."
  /^(found it|found the|i see\s*[—–-])/i,
  // Sub-agent task descriptions (workflow narration): text that opens with an
  // optionally bracketed weekday + ISO date stamp, e.g. "[Tue 2026-02-10 ..."
  /^\[?(mon|tue|wed|thu|fri|sat|sun)\s+\d{4}-\d{2}-\d{2}/i,
  // Context compaction self-announcements
  /^🔄\s*\*?\*?context reset/i,
  // Filename slug generation prompts (internal tool use)
  /^based on this conversation,?\s*generate a short/i,
];
export function passesAssistantAttentionGate(text: string): boolean {
const trimmed = text.trim();
@@ -144,6 +177,11 @@ export function passesAssistantAttentionGate(text: string): boolean {
return false;
}
// Assistant-specific narration patterns (play-by-play self-talk)
if (ASSISTANT_NARRATION_PATTERNS.some((r) => r.test(trimmed))) {
return false;
}
// Excessive emoji (likely reaction, not substance)
const emojiCount = (
trimmed.match(/[\u{1F300}-\u{1F9FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1FA00}-\u{1FAFF}]/gu) ||

View File

@@ -146,14 +146,26 @@ describe("passesAttentionGate", () => {
});
it("should accept messages with specific information/preferences", () => {
expect(passesAttentionGate("I prefer using TypeScript over JavaScript")).toBe(true);
expect(passesAttentionGate("My meeting with John is on Thursday")).toBe(true);
expect(passesAttentionGate("The project deadline was moved to March")).toBe(true);
expect(
passesAttentionGate("I strongly prefer using TypeScript over JavaScript for all projects"),
).toBe(true);
expect(
passesAttentionGate("My important meeting with John is scheduled for Thursday afternoon"),
).toBe(true);
expect(
passesAttentionGate("The project deadline was moved to March due to client feedback"),
).toBe(true);
});
it("should accept actionable requests with context", () => {
expect(passesAttentionGate("Let's limit the wa-group-monitoring to business hours")).toBe(true);
expect(passesAttentionGate("Can you check the error logs on the production server")).toBe(true);
expect(
passesAttentionGate("Let's limit the wa-group-monitoring cron job to business hours only"),
).toBe(true);
expect(
passesAttentionGate(
"Can you check the error logs on the production server for recent failures",
),
).toBe(true);
});
});
@@ -1334,6 +1346,84 @@ describe("passesAssistantAttentionGate", () => {
expect(passesAssistantAttentionGate("ok")).toBe(false);
expect(passesAssistantAttentionGate("sounds good")).toBe(false);
});
it("should reject 'Let me...' action narration", () => {
  // Each sample opens with a "Let me <verb>" / "Now let me <verb>" phrase.
  const narrationSamples = [
    "Let me check the error logs on the production server for recent failures and report back.",
    "Now let me update the dashboard and send the Slack report with today's results:",
    "Let me run the LinkedIn parallel outreach job and start by setting up the search term rotation.",
  ];
  for (const sample of narrationSamples) {
    expect(passesAssistantAttentionGate(sample)).toBe(false);
  }
});
it("should reject 'I'll...' action narration", () => {
  // Each sample opens with an "I'll <verb>" announcement of upcoming work.
  const narrationSamples = [
    "I'll run the email labeler to classify any unread, unlabeled emails right now.",
    "I'll check for newly accepted LinkedIn connections and update the tracker spreadsheet.",
  ];
  for (const sample of narrationSamples) {
    expect(passesAssistantAttentionGate(sample)).toBe(false);
  }
});
it("should reject 'Starting/Running/Processing...' status updates", () => {
  // Each sample opens with a progressive-verb status word.
  const statusSamples = [
    "Starting LinkedIn outreach for Training category using profile linkedin-3 with isolated browser.",
    "Processing through extraction steadily doing eight at a time against local Qwen model.",
  ];
  for (const sample of statusSamples) {
    expect(passesAssistantAttentionGate(sample)).toBe(false);
  }
});
it("should reject 'Good!/Perfect!' opener narration", () => {
  // Each sample opens with a one-word exclamation followed by narration.
  const openerSamples = [
    "Good! I can see the search results. I've identified several 2nd-degree prospects to connect with.",
    "Perfect! The connection dialog appeared. I'll click Add a note to add the personalized message.",
  ];
  for (const sample of openerSamples) {
    expect(passesAssistantAttentionGate(sample)).toBe(false);
  }
});
it("should reject context compaction announcements", () => {
  // Announcement text that opens with the U+1F504 emoji and a bold "Context Reset".
  const announcement =
    "\u{1F504} **Context Reset** \u{2014} My memory was just compacted. Last thing I remember: setting up Flux 2.";
  const accepted = passesAssistantAttentionGate(announcement);
  expect(accepted).toBe(false);
});
it("should still accept substantive assistant conclusions", () => {
  // Factual statements that do not open with any narration phrase.
  const conclusions = [
    "The memory-neo4j plugin uses confidence-weighted RRF for search result fusion and a 3-signal hybrid search combining HNSW, BM25, and graph traversal.",
    "Whisper wins accuracy across all tests while SenseVoice wins speed at seventeen to thirty-four times faster processing.",
  ];
  for (const conclusion of conclusions) {
    expect(passesAssistantAttentionGate(conclusion)).toBe(true);
  }
});
});
// ============================================================================

View File

@@ -806,6 +806,23 @@ export async function runSleepCycle(
}
}
// Bound the number of pairs handed to the LLM check so the sleep cycle
// cannot time out. When over the cap, rank pairs by similarity (highest
// first, since those are the likeliest duplicates) and drop the tail.
const MAX_SEMANTIC_DEDUP_PAIRS = 50;
if (allPairs.length > MAX_SEMANTIC_DEDUP_PAIRS) {
  const droppedCount = allPairs.length - MAX_SEMANTIC_DEDUP_PAIRS;
  // Pairs without a similarity score are treated as 0 and sink to the end.
  allPairs.sort((left, right) => (right.similarity ?? 0) - (left.similarity ?? 0));
  // Truncate in place: keep only the first MAX_SEMANTIC_DEDUP_PAIRS entries.
  allPairs.splice(MAX_SEMANTIC_DEDUP_PAIRS);
  const progressNote = `Capped at ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${droppedCount} lower-similarity pairs skipped)`;
  onProgress?.("semanticDedup", progressNote);
  logger.info(
    `memory-neo4j: [sleep] Phase 1b capped to ${MAX_SEMANTIC_DEDUP_PAIRS} pairs (${droppedCount} skipped)`,
  );
}
// Process pairs in concurrent batches
const invalidatedIds = new Set<string>();

View File

@@ -57,11 +57,10 @@ describe("passesAttentionGate", () => {
});
it("should accept messages at exactly 30 characters with sufficient words", () => {
// 30 chars, 5 words: "abcde abcde abcde abcde abcde" = 29 chars (5*5 + 4 spaces)
// Need 30+ chars and 5+ words
const text = "abcdef abcdef abcdef abcdef ab";
expect(text.length).toBe(30);
expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(5);
// Need 30+ chars and 8+ words
const text = "ab cd ef gh ij kl mn op qr st u";
expect(text.length).toBeGreaterThanOrEqual(30);
expect(text.split(/\s+/).length).toBeGreaterThanOrEqual(8);
expect(passesAttentionGate(text)).toBe(true);
});
@@ -81,15 +80,19 @@ describe("passesAttentionGate", () => {
// -----------------------------------------------------------------------
describe("word count", () => {
it("should reject messages with fewer than 5 words", () => {
// 4 words, but long enough in chars (> 30)
it("should reject messages with fewer than 8 words", () => {
// 7 words, but long enough in chars (> 30)
expect(
passesAttentionGate("thisislongword anotherlongword thirdlongword fourthlongword"),
passesAttentionGate(
"thisislongword anotherlongword thirdlongword fourthlongword fifth sixth seventh",
),
).toBe(false);
});
it("should accept messages with exactly 5 words", () => {
expect(passesAttentionGate("thisword thatword another fourth fifthword")).toBe(true);
it("should accept messages with exactly 8 words", () => {
expect(
passesAttentionGate("thisword thatword another fourth fifthword sixth seventh eighth"),
).toBe(true);
});
});

View File

@@ -1285,7 +1285,7 @@ async function runAutoCapture(
const result = await captureMessage(
text,
"auto-capture",
0.3,
0.5,
1.0,
agentId,
sessionKey,
@@ -1312,7 +1312,7 @@ async function runAutoCapture(
const result = await captureMessage(
text,
"auto-capture-assistant",
0.7,
0.8,
0.75,
agentId,
sessionKey,