From 18b8007d236ee38dbabb3479cef24c9353c4fdcc Mon Sep 17 00:00:00 2001 From: Tarun Sukhani Date: Mon, 16 Feb 2026 15:49:19 +0800 Subject: [PATCH] memory-neo4j: improve tag coverage with stronger extraction + retroactive tagging - Strengthen extraction prompt to always generate 2-4 tags per memory - Add Phase 2b: Retroactive Tagging to sleep cycle for untagged memories - Include 'skipped' memories in extraction pipeline (imported memories) - Add listUntaggedMemories() helper to neo4j-client - Add extractTagsOnly() lightweight prompt for tag-only extraction - Add CLI display for Phase 2b stats Fixes: 79% of memories had zero tags due to weak prompt guidance and imported memories never going through extraction. --- extensions/memory-neo4j/cli.ts | 8 ++ extensions/memory-neo4j/extractor.ts | 86 +++++++++++++++++- extensions/memory-neo4j/neo4j-client.ts | 32 ++++++- extensions/memory-neo4j/sleep-cycle.ts | 111 +++++++++++++++++++++++- 4 files changed, 232 insertions(+), 5 deletions(-) diff --git a/extensions/memory-neo4j/cli.ts b/extensions/memory-neo4j/cli.ts index eca188f7600..d1b3bab8b66 100644 --- a/extensions/memory-neo4j/cli.ts +++ b/extensions/memory-neo4j/cli.ts @@ -315,6 +315,7 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void { console.log(" Phase 1c: Conflict Detection — Resolve contradictory memories"); console.log(" Phase 1d: Entity Dedup — Merge duplicate entity nodes"); console.log(" Phase 2: Extraction — Extract entities and categorize"); + console.log(" Phase 2b: Retroactive Tagging — Tag memories missing topic tags"); console.log(" Phase 3: Decay & Pruning — Remove stale low-importance memories"); console.log(" Phase 4: Orphan Cleanup — Remove disconnected nodes"); console.log(" Phase 5: Noise Cleanup — Remove dangerous pattern memories"); @@ -399,6 +400,7 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void { conflict: "Phase 1c: Conflict Detection", entityDedup: "Phase 1d: Entity Deduplication", 
extraction: "Phase 2: Extraction", + retroactiveTagging: "Phase 2b: Retroactive Tagging", decay: "Phase 3: Decay & Pruning", cleanup: "Phase 4: Orphan Cleanup", noiseCleanup: "Phase 5: Noise Cleanup", @@ -430,6 +432,12 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void { ` Extraction: ${result.extraction.succeeded}/${result.extraction.total} extracted` + (result.extraction.failed > 0 ? ` (${result.extraction.failed} failed)` : ""), ); + console.log( + ` Retro-Tagging: ${result.retroactiveTagging.tagged}/${result.retroactiveTagging.total} tagged` + + (result.retroactiveTagging.failed > 0 + ? ` (${result.retroactiveTagging.failed} failed)` + : ""), + ); console.log( ` Cleanup: ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} tags removed`, ); diff --git a/extensions/memory-neo4j/extractor.ts b/extensions/memory-neo4j/extractor.ts index 5403217e6ce..8450babd0b9 100644 --- a/extensions/memory-neo4j/extractor.ts +++ b/extensions/memory-neo4j/extractor.ts @@ -53,9 +53,40 @@ Rules: - Good entities: "Tarun", "Abundent Academy", "Tioman Island", "LiveKit", "Neo4j", "Fish Speech S1 Mini" - Bad entities: "python", "ai", "automation", "email", "docker", "machine learning", "api" - When in doubt, do NOT extract — fewer high-quality entities beat many generic ones -- Return empty arrays if nothing specific to extract - Keep entity descriptions brief (1 sentence max) -- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous`; +- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous +- ALWAYS generate at least 2 tags. Every memory has a topic — there are no exceptions. +- Tags describe the TOPIC or DOMAIN of the memory, not the entities themselves. 
+- Do NOT use entity names as tags (e.g., don't tag "tarun" if Tarun is already an entity). +- Good tags: "travel planning", "family", "voice synthesis", "linkedin automation", "expense tracking", "cron scheduling", "api integration" +- Tag categories: "topic", "domain", "workflow", "technology", "personal", "business" +- Return empty entity/relationship arrays if nothing specific to extract, but NEVER return empty tags.`; + +// ============================================================================ +// Retroactive Tagging Prompt +// ============================================================================ + +/** + * Lightweight prompt for retroactive tagging of memories that were extracted + * without tags. Only asks for tags — no entities or relationships. + */ +const RETROACTIVE_TAGGING_SYSTEM = `You are a topic tagging system for a personal memory store. +Generate 2-4 topic tags that describe what this memory is about. + +Return JSON: +{ + "tags": [ + {"name": "tag name", "category": "topic|domain|workflow|technology|personal|business"} + ] +} + +Rules: +- Tags describe the TOPIC or DOMAIN of the memory, not specific people or tools mentioned. +- Good tags: "travel planning", "family", "voice synthesis", "linkedin automation", "expense tracking", "cron scheduling", "api integration", "system configuration", "memory management" +- Bad tags: names of people, companies, or specific tools (those are entities, not topics) +- Tag categories: "topic" (general subject), "domain" (field/area), "workflow" (process/procedure), "technology" (tech area), "personal" (personal life), "business" (work/business) +- ALWAYS return at least 2 tags. Every memory has a topic. 
+- Normalize tag names to lowercase with spaces (no hyphens or underscores).`; // ============================================================================ // Entity Extraction @@ -118,6 +149,57 @@ export async function extractEntities( } } +/** + * Extract only tags from a memory text using a lightweight LLM prompt. + * Used for retroactive tagging of memories that were extracted without tags. + * + * Returns an array of tags, or null on failure. + */ +export async function extractTagsOnly( + text: string, + config: ExtractionConfig, + abortSignal?: AbortSignal, +): Promise<Array<{ name: string; category: string }> | null> { + if (!config.enabled) { + return null; + } + + const messages = [ + { role: "system", content: RETROACTIVE_TAGGING_SYSTEM }, + { role: "user", content: text }, + ]; + + let content: string | null; + try { + content = await callOpenRouterStream(config, messages, abortSignal); + } catch { + return null; + } + + if (!content) { + return null; + } + + try { + const parsed = JSON.parse(content) as { tags?: unknown }; + const rawTags = Array.isArray(parsed.tags) ? parsed.tags : []; + return rawTags + .filter( + (t: unknown): t is Record<string, unknown> => + t !== null && + typeof t === "object" && + typeof (t as Record<string, unknown>).name === "string", + ) + .map((t) => ({ + name: normalizeTagName(String(t.name)), + category: typeof t.category === "string" ? t.category : "topic", + })) + .filter((t) => t.name.length > 0); + } catch { + return null; + } +} + /** * Normalize a tag name: lowercase, collapse hyphens/underscores to spaces, * collapse multiple spaces, trim. Ensures "machine-learning", "machine_learning", diff --git a/extensions/memory-neo4j/neo4j-client.ts b/extensions/memory-neo4j/neo4j-client.ts index ea52fe45d84..c11818b0131 100644 --- a/extensions/memory-neo4j/neo4j-client.ts +++ b/extensions/memory-neo4j/neo4j-client.ts @@ -915,7 +915,7 @@ export class Neo4jMemoryClient { const agentFilter = agentId ? 
"AND m.agentId = $agentId" : ""; const result = await session.run( `MATCH (m:Memory) - WHERE m.extractionStatus = 'pending' ${agentFilter} + WHERE m.extractionStatus IN ['pending', 'skipped'] ${agentFilter} RETURN m.id AS id, m.text AS text, m.agentId AS agentId, coalesce(m.extractionRetries, 0) AS extractionRetries ORDER BY m.createdAt ASC @@ -967,6 +967,36 @@ export class Neo4jMemoryClient { } } + /** + * List memories with completed extraction but no TAGGED relationships. + * Used by the retroactive tagging phase to find memories that need tags. + */ + async listUntaggedMemories( + limit: number = 50, + agentId?: string, + ): Promise<Array<{ id: string; text: string }>> { + await this.ensureInitialized(); + const session = this.driver!.session(); + try { + const agentFilter = agentId ? "AND m.agentId = $agentId" : ""; + const result = await session.run( + `MATCH (m:Memory) + WHERE m.extractionStatus = 'complete' ${agentFilter} + AND NOT EXISTS { MATCH (m)-[:TAGGED]->(:Tag) } + RETURN m.id AS id, m.text AS text + ORDER BY m.createdAt ASC + LIMIT $limit`, + { limit: neo4j.int(limit), ...(agentId ? 
{ agentId } : {}) }, + ); + return result.records.map((r) => ({ + id: r.get("id") as string, + text: r.get("text") as string, + })); + } finally { + await session.close(); + } + } + // -------------------------------------------------------------------------- // Sleep Cycle: Deduplication // -------------------------------------------------------------------------- diff --git a/extensions/memory-neo4j/sleep-cycle.ts b/extensions/memory-neo4j/sleep-cycle.ts index 44681702c7d..e8e54cda789 100644 --- a/extensions/memory-neo4j/sleep-cycle.ts +++ b/extensions/memory-neo4j/sleep-cycle.ts @@ -23,7 +23,12 @@ import type { ExtractionConfig } from "./config.js"; import type { Embeddings } from "./embeddings.js"; import type { Neo4jMemoryClient } from "./neo4j-client.js"; import type { Logger } from "./schema.js"; -import { isSemanticDuplicate, resolveConflict, runBackgroundExtraction } from "./extractor.js"; +import { + extractTagsOnly, + isSemanticDuplicate, + resolveConflict, + runBackgroundExtraction, +} from "./extractor.js"; import { makePairKey } from "./schema.js"; import { reviewAndArchiveStaleTasks, type StaleTaskResult } from "./task-ledger.js"; @@ -59,6 +64,12 @@ export type SleepCycleResult = { succeeded: number; failed: number; }; + // Phase 2b: Retroactive Tagging + retroactiveTagging: { + total: number; + tagged: number; + failed: number; + }; // Phase 3: Decay & Pruning decay: { memoriesPruned: number; @@ -105,6 +116,10 @@ export type SleepCycleOptions = { extractionBatchSize?: number; // Memories per batch (default: 50) extractionDelayMs?: number; // Delay between batches (default: 1000) + // Phase 2b: Retroactive Tagging + skipRetroactiveTagging?: boolean; // Skip retroactive tagging (default: false) + retroactiveTagBatchSize?: number; // Memories per batch (default: 50) + // Phase 4: Cleanup singleUseTagMinAgeDays?: number; // Min age before single-use tag pruning (default: 14) @@ -127,6 +142,7 @@ export type SleepCycleOptions = { | "entityDedup" | "decay" 
| "extraction" + | "retroactiveTagging" | "cleanup" | "noiseCleanup" | "credentialScan" @@ -225,6 +241,8 @@ export async function runSleepCycle( decayCurves, extractionBatchSize = 50, extractionDelayMs = 1000, + skipRetroactiveTagging = false, + retroactiveTagBatchSize = 50, singleUseTagMinAgeDays = 14, workspaceDir, staleTaskMaxAgeMs, @@ -239,6 +257,7 @@ export async function runSleepCycle( entityDedup: { pairsFound: 0, merged: 0 }, decay: { memoriesPruned: 0 }, extraction: { total: 0, processed: 0, succeeded: 0, failed: 0 }, + retroactiveTagging: { total: 0, tagged: 0, failed: 0 }, cleanup: { entitiesRemoved: 0, tagsRemoved: 0, singleUseTagsRemoved: 0 }, credentialScan: { memoriesScanned: 0, credentialsFound: 0, memoriesRemoved: 0 }, taskLedger: { staleCount: 0, archivedCount: 0, archivedIds: [] }, @@ -541,7 +560,7 @@ export async function runSleepCycle( try { // Get initial count const counts = await db.countByExtractionStatus(agentId); - result.extraction.total = counts.pending; + result.extraction.total = counts.pending + counts.skipped; if (result.extraction.total > 0) { let hasMore = true; @@ -616,6 +635,94 @@ export async function runSleepCycle( logger.info("memory-neo4j: [sleep] Phase 2 skipped — extraction not enabled"); } + // -------------------------------------------------------------------------- + // Phase 2b: Retroactive Tagging + // Find memories with completed extraction but no tags, and generate tags + // using a lightweight LLM prompt. This fixes the historical gap where + // the extraction prompt treated tags as optional. 
+ // -------------------------------------------------------------------------- + if (!abortSignal?.aborted && config.enabled && !skipRetroactiveTagging) { + onPhaseStart?.("retroactiveTagging"); + logger.info("memory-neo4j: [sleep] Phase 2b: Retroactive Tagging"); + + try { + let hasMore = true; + while (hasMore && !abortSignal?.aborted) { + const untagged = await db.listUntaggedMemories(retroactiveTagBatchSize, agentId); + + if (untagged.length === 0) { + hasMore = false; + break; + } + + // Count total on first batch + if (result.retroactiveTagging.total === 0) { + result.retroactiveTagging.total = untagged.length; + } + + // Process in parallel chunks of llmConcurrency + for (let i = 0; i < untagged.length && !abortSignal?.aborted; i += llmConcurrency) { + const chunk = untagged.slice(i, i + llmConcurrency); + const outcomes = await Promise.allSettled( + chunk.map((memory) => extractTagsOnly(memory.text, config, abortSignal)), + ); + + for (let k = 0; k < outcomes.length; k++) { + const outcome = outcomes[k]; + const memory = chunk[k]; + + if (outcome.status === "fulfilled" && outcome.value && outcome.value.length > 0) { + try { + await db.batchEntityOperations(memory.id, [], [], outcome.value); + result.retroactiveTagging.tagged++; + onProgress?.( + "retroactiveTagging", + `Tagged "${memory.text.slice(0, 50)}..." 
with ${outcome.value.length} tags`, + ); + } catch (err) { + result.retroactiveTagging.failed++; + logger.warn( + `memory-neo4j: [sleep] retroactive tagging write failed for ${memory.id.slice(0, 8)}: ${String(err)}`, + ); + } + } else { + result.retroactiveTagging.failed++; + } + } + } + + // Check if there are more untagged memories + const nextBatch = await db.listUntaggedMemories(1, agentId); + hasMore = nextBatch.length > 0; + + // Delay between batches (abort-aware) + if (hasMore && !abortSignal?.aborted) { + await new Promise<void>((resolve) => { + const timer = setTimeout(resolve, extractionDelayMs); + abortSignal?.addEventListener( + "abort", + () => { + clearTimeout(timer); + resolve(); + }, + { once: true }, + ); + }); + } + } + + logger.info( + `memory-neo4j: [sleep] Phase 2b complete — ${result.retroactiveTagging.tagged} tagged, ${result.retroactiveTagging.failed} failed`, + ); + } catch (err) { + logger.warn(`memory-neo4j: [sleep] Phase 2b error: ${String(err)}`); + } + } else if (!config.enabled) { + logger.info("memory-neo4j: [sleep] Phase 2b skipped — extraction not enabled"); + } else if (skipRetroactiveTagging) { + logger.info("memory-neo4j: [sleep] Phase 2b skipped — retroactive tagging disabled"); + } + // -------------------------------------------------------------------------- // Phase 3: Decay & Pruning (after extraction so freshly extracted memories // aren't pruned before they build entity connections)