memory: add multimodal image and audio indexing

This commit is contained in:
Gustavo Madeira Santana
2026-03-11 20:45:55 +00:00
parent 20d097ac2f
commit 73c9e141a4
21 changed files with 924 additions and 86 deletions

View File

@@ -284,9 +284,45 @@ Notes:
- Paths can be absolute or workspace-relative. - Paths can be absolute or workspace-relative.
- Directories are scanned recursively for `.md` files. - Directories are scanned recursively for `.md` files.
- Only Markdown files are indexed. - By default, only Markdown files are indexed.
- If `memorySearch.multimodal.enabled = true`, OpenClaw also indexes supported image/audio files under `extraPaths` only. Default memory roots (`MEMORY.md`, `memory.md`, `memory/**/*.md`) stay Markdown-only.
- Symlinks are ignored (files or directories). - Symlinks are ignored (files or directories).
### Multimodal memory files (Gemini image + audio)
OpenClaw can index image and audio files from `memorySearch.extraPaths` when using the `gemini-embedding-2-preview` embedding model:
```json5
agents: {
defaults: {
memorySearch: {
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: ["assets/reference", "voice-notes"],
multimodal: {
enabled: true,
modalities: ["image", "audio"], // or ["all"]
maxFileBytes: 10000000
},
remote: {
apiKey: "YOUR_GEMINI_API_KEY"
}
}
}
}
```
Notes:
- Multimodal memory is currently supported only for `gemini-embedding-2-preview`.
- Multimodal indexing applies only to files discovered through `memorySearch.extraPaths`.
- Supported modalities in this phase: image and audio.
- `memorySearch.fallback` must stay `"none"` while multimodal memory is enabled.
- Supported image extensions: `.jpg`, `.jpeg`, `.png`, `.webp`, `.gif`, `.heic`, `.heif`.
- Supported audio extensions: `.mp3`, `.wav`, `.ogg`, `.opus`, `.m4a`, `.aac`, `.flac`.
- Search queries remain text, but Gemini can compare those text queries against indexed image/audio embeddings.
- `memory_get` still reads Markdown only; binary files are searchable but not returned as raw file contents.
### Gemini embeddings (native) ### Gemini embeddings (native)
Set the provider to `gemini` to use the Gemini embeddings API directly: Set the provider to `gemini` to use the Gemini embeddings API directly:

View File

@@ -131,6 +131,65 @@ describe("memory search config", () => {
expect(resolved?.extraPaths).toEqual(["/shared/notes", "docs", "../team-notes"]); expect(resolved?.extraPaths).toEqual(["/shared/notes", "docs", "../team-notes"]);
}); });
// Verifies that modalities: ["all"] is expanded to the concrete
// ["image", "audio"] list when multimodal settings are resolved, while
// enabled and maxFileBytes pass through unchanged.
it("normalizes multimodal settings", () => {
  const cfg = asConfig({
    agents: {
      defaults: {
        memorySearch: {
          provider: "gemini",
          model: "gemini-embedding-2-preview",
          multimodal: {
            enabled: true,
            modalities: ["all"],
            maxFileBytes: 8192,
          },
        },
      },
    },
  });
  const resolved = resolveMemorySearchConfig(cfg, "main");
  expect(resolved?.multimodal).toEqual({
    enabled: true,
    modalities: ["image", "audio"],
    maxFileBytes: 8192,
  });
});
// Multimodal memory is gated to the Gemini provider; enabling it with an
// OpenAI embedding model must make config resolution throw.
it("rejects multimodal memory on unsupported providers", () => {
  const cfg = asConfig({
    agents: {
      defaults: {
        memorySearch: {
          provider: "openai",
          model: "text-embedding-3-small",
          multimodal: { enabled: true, modalities: ["image"] },
        },
      },
    },
  });
  expect(() => resolveMemorySearchConfig(cfg, "main")).toThrow(
    /memorySearch\.multimodal requires memorySearch\.provider = "gemini"/,
  );
});
// Multimodal memory requires fallback = "none"; any configured fallback
// provider must be rejected during config resolution.
it("rejects multimodal memory when fallback is configured", () => {
  const cfg = asConfig({
    agents: {
      defaults: {
        memorySearch: {
          provider: "gemini",
          model: "gemini-embedding-2-preview",
          fallback: "openai",
          multimodal: { enabled: true, modalities: ["image"] },
        },
      },
    },
  });
  expect(() => resolveMemorySearchConfig(cfg, "main")).toThrow(
    /memorySearch\.multimodal does not support memorySearch\.fallback/,
  );
});
it("includes batch defaults for openai without remote overrides", () => { it("includes batch defaults for openai without remote overrides", () => {
const cfg = configWithDefaultProvider("openai"); const cfg = configWithDefaultProvider("openai");
const resolved = resolveMemorySearchConfig(cfg, "main"); const resolved = resolveMemorySearchConfig(cfg, "main");

View File

@@ -3,6 +3,11 @@ import path from "node:path";
import type { OpenClawConfig, MemorySearchConfig } from "../config/config.js"; import type { OpenClawConfig, MemorySearchConfig } from "../config/config.js";
import { resolveStateDir } from "../config/paths.js"; import { resolveStateDir } from "../config/paths.js";
import type { SecretInput } from "../config/types.secrets.js"; import type { SecretInput } from "../config/types.secrets.js";
import {
normalizeMemoryMultimodalSettings,
supportsMemoryMultimodalEmbeddings,
type MemoryMultimodalSettings,
} from "../memory/multimodal.js";
import { clampInt, clampNumber, resolveUserPath } from "../utils.js"; import { clampInt, clampNumber, resolveUserPath } from "../utils.js";
import { resolveAgentConfig } from "./agent-scope.js"; import { resolveAgentConfig } from "./agent-scope.js";
@@ -10,6 +15,7 @@ export type ResolvedMemorySearchConfig = {
enabled: boolean; enabled: boolean;
sources: Array<"memory" | "sessions">; sources: Array<"memory" | "sessions">;
extraPaths: string[]; extraPaths: string[];
multimodal: MemoryMultimodalSettings;
provider: "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama" | "auto"; provider: "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama" | "auto";
remote?: { remote?: {
baseUrl?: string; baseUrl?: string;
@@ -204,6 +210,11 @@ function mergeConfig(
.map((value) => value.trim()) .map((value) => value.trim())
.filter(Boolean); .filter(Boolean);
const extraPaths = Array.from(new Set(rawPaths)); const extraPaths = Array.from(new Set(rawPaths));
const multimodal = normalizeMemoryMultimodalSettings({
enabled: overrides?.multimodal?.enabled ?? defaults?.multimodal?.enabled,
modalities: overrides?.multimodal?.modalities ?? defaults?.multimodal?.modalities,
maxFileBytes: overrides?.multimodal?.maxFileBytes ?? defaults?.multimodal?.maxFileBytes,
});
const vector = { const vector = {
enabled: overrides?.store?.vector?.enabled ?? defaults?.store?.vector?.enabled ?? true, enabled: overrides?.store?.vector?.enabled ?? defaults?.store?.vector?.enabled ?? true,
extensionPath: extensionPath:
@@ -307,6 +318,7 @@ function mergeConfig(
enabled, enabled,
sources, sources,
extraPaths, extraPaths,
multimodal,
provider, provider,
remote, remote,
experimental: { experimental: {
@@ -365,5 +377,21 @@ export function resolveMemorySearchConfig(
if (!resolved.enabled) { if (!resolved.enabled) {
return null; return null;
} }
if (
resolved.multimodal.enabled &&
!supportsMemoryMultimodalEmbeddings({
provider: resolved.provider,
model: resolved.model,
})
) {
throw new Error(
'agents.*.memorySearch.multimodal requires memorySearch.provider = "gemini" and model = "gemini-embedding-2-preview".',
);
}
if (resolved.multimodal.enabled && resolved.fallback !== "none") {
throw new Error(
'agents.*.memorySearch.multimodal does not support memorySearch.fallback. Set fallback to "none".',
);
}
return resolved; return resolved;
} }

View File

@@ -72,6 +72,10 @@ const TARGET_KEYS = [
"agents.defaults.memorySearch.fallback", "agents.defaults.memorySearch.fallback",
"agents.defaults.memorySearch.sources", "agents.defaults.memorySearch.sources",
"agents.defaults.memorySearch.extraPaths", "agents.defaults.memorySearch.extraPaths",
"agents.defaults.memorySearch.multimodal",
"agents.defaults.memorySearch.multimodal.enabled",
"agents.defaults.memorySearch.multimodal.modalities",
"agents.defaults.memorySearch.multimodal.maxFileBytes",
"agents.defaults.memorySearch.experimental.sessionMemory", "agents.defaults.memorySearch.experimental.sessionMemory",
"agents.defaults.memorySearch.remote.baseUrl", "agents.defaults.memorySearch.remote.baseUrl",
"agents.defaults.memorySearch.remote.apiKey", "agents.defaults.memorySearch.remote.apiKey",

View File

@@ -778,7 +778,15 @@ export const FIELD_HELP: Record<string, string> = {
"agents.defaults.memorySearch.sources": "agents.defaults.memorySearch.sources":
'Chooses which sources are indexed: "memory" reads MEMORY.md + memory files, and "sessions" includes transcript history. Keep ["memory"] unless you need recall from prior chat transcripts.', 'Chooses which sources are indexed: "memory" reads MEMORY.md + memory files, and "sessions" includes transcript history. Keep ["memory"] unless you need recall from prior chat transcripts.',
"agents.defaults.memorySearch.extraPaths": "agents.defaults.memorySearch.extraPaths":
"Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; keep paths small and intentional to avoid noisy recall.", "Adds extra directories or .md files to the memory index beyond default memory files. Use this when key reference docs live elsewhere in your repo; when multimodal memory is enabled, matching image/audio files under these paths are also eligible for indexing.",
"agents.defaults.memorySearch.multimodal":
'Optional multimodal memory settings for indexing image and audio files from configured extra paths. Keep this off unless your embedding model explicitly supports cross-modal embeddings, and set `memorySearch.fallback` to "none" while it is enabled.',
"agents.defaults.memorySearch.multimodal.enabled":
"Enables image/audio memory indexing from extraPaths. This currently requires Gemini embedding-2, keeps the default memory roots Markdown-only, and disables memory-search fallback providers.",
"agents.defaults.memorySearch.multimodal.modalities":
'Selects which multimodal file types are indexed from extraPaths: "image", "audio", or "all". Keep this narrow to avoid indexing large binary corpora unintentionally.',
"agents.defaults.memorySearch.multimodal.maxFileBytes":
"Sets the maximum bytes allowed per multimodal file before it is skipped during memory indexing. Use this to cap upload cost and indexing latency, or raise it for short high-quality audio clips.",
"agents.defaults.memorySearch.experimental.sessionMemory": "agents.defaults.memorySearch.experimental.sessionMemory":
"Indexes session transcripts into memory search so responses can reference prior chat turns. Keep this off unless transcript recall is needed, because indexing cost and storage usage both increase.", "Indexes session transcripts into memory search so responses can reference prior chat turns. Keep this off unless transcript recall is needed, because indexing cost and storage usage both increase.",
"agents.defaults.memorySearch.provider": "agents.defaults.memorySearch.provider":

View File

@@ -319,6 +319,10 @@ export const FIELD_LABELS: Record<string, string> = {
"agents.defaults.memorySearch.enabled": "Enable Memory Search", "agents.defaults.memorySearch.enabled": "Enable Memory Search",
"agents.defaults.memorySearch.sources": "Memory Search Sources", "agents.defaults.memorySearch.sources": "Memory Search Sources",
"agents.defaults.memorySearch.extraPaths": "Extra Memory Paths", "agents.defaults.memorySearch.extraPaths": "Extra Memory Paths",
"agents.defaults.memorySearch.multimodal": "Memory Search Multimodal",
"agents.defaults.memorySearch.multimodal.enabled": "Enable Memory Search Multimodal",
"agents.defaults.memorySearch.multimodal.modalities": "Memory Search Multimodal Modalities",
"agents.defaults.memorySearch.multimodal.maxFileBytes": "Memory Search Multimodal Max File Bytes",
"agents.defaults.memorySearch.experimental.sessionMemory": "agents.defaults.memorySearch.experimental.sessionMemory":
"Memory Search Session Index (Experimental)", "Memory Search Session Index (Experimental)",
"agents.defaults.memorySearch.provider": "Memory Search Provider", "agents.defaults.memorySearch.provider": "Memory Search Provider",

View File

@@ -319,6 +319,15 @@ export type MemorySearchConfig = {
sources?: Array<"memory" | "sessions">; sources?: Array<"memory" | "sessions">;
/** Extra paths to include in memory search (directories or .md files). */ /** Extra paths to include in memory search (directories or .md files). */
extraPaths?: string[]; extraPaths?: string[];
/** Optional multimodal file indexing for selected extra paths. */
multimodal?: {
/** Enable image/audio embeddings from extraPaths. */
enabled?: boolean;
/** Which non-text file types to index. */
modalities?: Array<"image" | "audio" | "all">;
/** Max bytes allowed per multimodal file before it is skipped. */
maxFileBytes?: number;
};
/** Experimental memory search settings. */ /** Experimental memory search settings. */
experimental?: { experimental?: {
/** Enable session transcript indexing (experimental, default: false). */ /** Enable session transcript indexing (experimental, default: false). */

View File

@@ -553,6 +553,16 @@ export const MemorySearchSchema = z
enabled: z.boolean().optional(), enabled: z.boolean().optional(),
sources: z.array(z.union([z.literal("memory"), z.literal("sessions")])).optional(), sources: z.array(z.union([z.literal("memory"), z.literal("sessions")])).optional(),
extraPaths: z.array(z.string()).optional(), extraPaths: z.array(z.string()).optional(),
multimodal: z
.object({
enabled: z.boolean().optional(),
modalities: z
.array(z.union([z.literal("image"), z.literal("audio"), z.literal("all")]))
.optional(),
maxFileBytes: z.number().int().positive().optional(),
})
.strict()
.optional(),
experimental: z experimental: z
.object({ .object({
sessionMemory: z.boolean().optional(), sessionMemory: z.boolean().optional(),

View File

@@ -12,6 +12,10 @@ const EXT_BY_MIME: Record<string, string> = {
"image/gif": ".gif", "image/gif": ".gif",
"audio/ogg": ".ogg", "audio/ogg": ".ogg",
"audio/mpeg": ".mp3", "audio/mpeg": ".mp3",
"audio/wav": ".wav",
"audio/flac": ".flac",
"audio/aac": ".aac",
"audio/opus": ".opus",
"audio/x-m4a": ".m4a", "audio/x-m4a": ".m4a",
"audio/mp4": ".m4a", "audio/mp4": ".m4a",
"video/mp4": ".mp4", "video/mp4": ".mp4",

View File

@@ -1,4 +1,5 @@
import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js"; import { estimateUtf8Bytes, splitTextToUtf8ByteLimit } from "./embedding-input-limits.js";
import { hasNonTextEmbeddingParts } from "./embedding-inputs.js";
import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js"; import { resolveEmbeddingMaxInputTokens } from "./embedding-model-limits.js";
import type { EmbeddingProvider } from "./embeddings.js"; import type { EmbeddingProvider } from "./embeddings.js";
import { hashText, type MemoryChunk } from "./internal.js"; import { hashText, type MemoryChunk } from "./internal.js";
@@ -16,6 +17,10 @@ export function enforceEmbeddingMaxInputTokens(
const out: MemoryChunk[] = []; const out: MemoryChunk[] = [];
for (const chunk of chunks) { for (const chunk of chunks) {
if (hasNonTextEmbeddingParts(chunk.embeddingInput)) {
out.push(chunk);
continue;
}
if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) { if (estimateUtf8Bytes(chunk.text) <= maxInputTokens) {
out.push(chunk); out.push(chunk);
continue; continue;
@@ -27,6 +32,7 @@ export function enforceEmbeddingMaxInputTokens(
endLine: chunk.endLine, endLine: chunk.endLine,
text, text,
hash: hashText(text), hash: hashText(text),
embeddingInput: { text },
}); });
} }
} }

View File

@@ -1,3 +1,5 @@
import type { EmbeddingInput } from "./embedding-inputs.js";
// Helpers for enforcing embedding model input size limits. // Helpers for enforcing embedding model input size limits.
// //
// We use UTF-8 byte length as a conservative upper bound for tokenizer output. // We use UTF-8 byte length as a conservative upper bound for tokenizer output.
@@ -11,6 +13,22 @@ export function estimateUtf8Bytes(text: string): number {
return Buffer.byteLength(text, "utf8"); return Buffer.byteLength(text, "utf8");
} }
/**
 * Estimates the UTF-8 byte footprint of a structured embedding input.
 *
 * Inputs without structured parts fall back to the plain-text estimate of
 * `input.text`. For part lists, each text part contributes the bytes of
 * its text, and each inline-data part contributes the bytes of its MIME
 * type plus its (already-encoded) data payload.
 */
export function estimateStructuredEmbeddingInputBytes(input: EmbeddingInput): number {
  const parts = input.parts;
  if (!parts?.length) {
    return estimateUtf8Bytes(input.text);
  }
  return parts.reduce(
    (sum, part) =>
      part.type === "text"
        ? sum + estimateUtf8Bytes(part.text)
        : sum + estimateUtf8Bytes(part.mimeType) + estimateUtf8Bytes(part.data),
    0,
  );
}
export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] { export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
if (maxUtf8Bytes <= 0) { if (maxUtf8Bytes <= 0) {
return [text]; return [text];

View File

@@ -0,0 +1,34 @@
/** A plain-text segment of an embedding input. */
export type EmbeddingInputTextPart = {
  type: "text";
  text: string;
};

/** A binary segment (e.g. image or audio bytes) of an embedding input. */
export type EmbeddingInputInlineDataPart = {
  type: "inline-data";
  mimeType: string;
  data: string;
};

export type EmbeddingInputPart = EmbeddingInputTextPart | EmbeddingInputInlineDataPart;

/**
 * A single embedding payload. `text` always carries a textual
 * representation; `parts`, when present, carries the structured
 * (possibly multimodal) breakdown that providers may prefer.
 */
export type EmbeddingInput = {
  text: string;
  parts?: EmbeddingInputPart[];
};

/** Wraps plain text in the structured embedding-input shape. */
export function buildTextEmbeddingInput(text: string): EmbeddingInput {
  return { text };
}

/** Type guard: true when a part carries inline binary data. */
export function isInlineDataEmbeddingInputPart(
  part: EmbeddingInputPart,
): part is EmbeddingInputInlineDataPart {
  return part.type === "inline-data";
}

/** True when the input contains at least one inline-data (non-text) part. */
export function hasNonTextEmbeddingParts(input: EmbeddingInput | undefined): boolean {
  const parts = input?.parts ?? [];
  return parts.some(isInlineDataEmbeddingInputPart);
}

View File

@@ -1,6 +1,7 @@
import { afterEach, describe, expect, it, vi } from "vitest"; import { afterEach, describe, expect, it, vi } from "vitest";
import * as authModule from "../agents/model-auth.js"; import * as authModule from "../agents/model-auth.js";
import { import {
buildGeminiEmbeddingRequest,
buildFileDataPart, buildFileDataPart,
buildGeminiParts, buildGeminiParts,
buildGeminiTextEmbeddingRequest, buildGeminiTextEmbeddingRequest,
@@ -113,6 +114,35 @@ describe("buildGeminiTextEmbeddingRequest", () => {
}); });
}); });
// buildGeminiEmbeddingRequest should map structured input parts onto the
// Gemini wire format: text parts become { text }, inline-data parts
// become { inlineData: { mimeType, data } }, and modelPath/taskType/
// outputDimensionality are carried through to the request envelope.
describe("buildGeminiEmbeddingRequest", () => {
  it("builds a multimodal request from structured input parts", () => {
    expect(
      buildGeminiEmbeddingRequest({
        input: {
          text: "Image file: diagram.png",
          parts: [
            { type: "text", text: "Image file: diagram.png" },
            { type: "inline-data", mimeType: "image/png", data: "abc123" },
          ],
        },
        taskType: "RETRIEVAL_DOCUMENT",
        modelPath: "models/gemini-embedding-2-preview",
        outputDimensionality: 1536,
      }),
    ).toEqual({
      model: "models/gemini-embedding-2-preview",
      content: {
        parts: [
          { text: "Image file: diagram.png" },
          { inlineData: { mimeType: "image/png", data: "abc123" } },
        ],
      },
      taskType: "RETRIEVAL_DOCUMENT",
      outputDimensionality: 1536,
    });
  });
});
// ---------- Model detection ---------- // ---------- Model detection ----------
describe("isGeminiEmbedding2Model", () => { describe("isGeminiEmbedding2Model", () => {
@@ -341,6 +371,63 @@ describe("gemini-embedding-2-preview provider", () => {
]); ]);
}); });
// Checks the batch endpoint body produced by the Gemini provider:
// embedBatchInputs should serialize each structured input into its own
// request entry, preserving text and inlineData parts and applying the
// RETRIEVAL_DOCUMENT task type. outputDimensionality 3072 is expected
// here — presumably the model default for gemini-embedding-2-preview;
// confirm against the provider's dimensionality resolution.
it("supports multimodal embedBatchInputs requests", async () => {
  const fetchMock = createGeminiBatchFetchMock(2);
  vi.stubGlobal("fetch", fetchMock);
  mockResolvedProviderKey();
  const { provider } = await createGeminiEmbeddingProvider({
    config: {} as never,
    provider: "gemini",
    model: "gemini-embedding-2-preview",
    fallback: "none",
  });
  // embedBatchInputs is optional on the provider interface; the Gemini
  // provider must expose it.
  expect(provider.embedBatchInputs).toBeDefined();
  await provider.embedBatchInputs?.([
    {
      text: "Image file: diagram.png",
      parts: [
        { type: "text", text: "Image file: diagram.png" },
        { type: "inline-data", mimeType: "image/png", data: "img" },
      ],
    },
    {
      text: "Audio file: note.wav",
      parts: [
        { type: "text", text: "Audio file: note.wav" },
        { type: "inline-data", mimeType: "audio/wav", data: "aud" },
      ],
    },
  ]);
  const body = parseFetchBody(fetchMock);
  expect(body.requests).toEqual([
    {
      model: "models/gemini-embedding-2-preview",
      content: {
        parts: [
          { text: "Image file: diagram.png" },
          { inlineData: { mimeType: "image/png", data: "img" } },
        ],
      },
      taskType: "RETRIEVAL_DOCUMENT",
      outputDimensionality: 3072,
    },
    {
      model: "models/gemini-embedding-2-preview",
      content: {
        parts: [
          { text: "Audio file: note.wav" },
          { inlineData: { mimeType: "audio/wav", data: "aud" } },
        ],
      },
      taskType: "RETRIEVAL_DOCUMENT",
      outputDimensionality: 3072,
    },
  ]);
});
it("throws for invalid outputDimensionality", async () => { it("throws for invalid outputDimensionality", async () => {
mockResolvedProviderKey(); mockResolvedProviderKey();

View File

@@ -6,6 +6,7 @@ import { requireApiKey, resolveApiKeyForProvider } from "../agents/model-auth.js
import { parseGeminiAuth } from "../infra/gemini-auth.js"; import { parseGeminiAuth } from "../infra/gemini-auth.js";
import type { SsrFPolicy } from "../infra/net/ssrf.js"; import type { SsrFPolicy } from "../infra/net/ssrf.js";
import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js"; import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js";
import type { EmbeddingInput } from "./embedding-inputs.js";
import { debugEmbeddingsLog } from "./embeddings-debug.js"; import { debugEmbeddingsLog } from "./embeddings-debug.js";
import type { EmbeddingProvider, EmbeddingProviderOptions } from "./embeddings.js"; import type { EmbeddingProvider, EmbeddingProviderOptions } from "./embeddings.js";
import { buildRemoteBaseUrlPolicy, withRemoteHttpResponse } from "./remote-http.js"; import { buildRemoteBaseUrlPolicy, withRemoteHttpResponse } from "./remote-http.js";
@@ -54,12 +55,13 @@ export type GeminiFilePart = {
fileData: { mimeType: string; fileUri: string }; fileData: { mimeType: string; fileUri: string };
}; };
export type GeminiPart = GeminiTextPart | GeminiInlinePart | GeminiFilePart; export type GeminiPart = GeminiTextPart | GeminiInlinePart | GeminiFilePart;
export type GeminiTextEmbeddingRequest = { export type GeminiEmbeddingRequest = {
content: { parts: GeminiTextPart[] }; content: { parts: GeminiPart[] };
taskType: GeminiTaskType; taskType: GeminiTaskType;
outputDimensionality?: number; outputDimensionality?: number;
model?: string; model?: string;
}; };
export type GeminiTextEmbeddingRequest = GeminiEmbeddingRequest;
/** Convert a string or pre-built parts array into `GeminiPart[]`. */ /** Convert a string or pre-built parts array into `GeminiPart[]`. */
export function buildGeminiParts(input: string | GeminiPart[]): GeminiPart[] { export function buildGeminiParts(input: string | GeminiPart[]): GeminiPart[] {
@@ -86,8 +88,30 @@ export function buildGeminiTextEmbeddingRequest(params: {
outputDimensionality?: number; outputDimensionality?: number;
modelPath?: string; modelPath?: string;
}): GeminiTextEmbeddingRequest { }): GeminiTextEmbeddingRequest {
const request: GeminiTextEmbeddingRequest = { return buildGeminiEmbeddingRequest({
content: { parts: [{ text: params.text }] }, input: { text: params.text },
taskType: params.taskType,
outputDimensionality: params.outputDimensionality,
modelPath: params.modelPath,
});
}
export function buildGeminiEmbeddingRequest(params: {
input: EmbeddingInput;
taskType: GeminiTaskType;
outputDimensionality?: number;
modelPath?: string;
}): GeminiEmbeddingRequest {
const request: GeminiEmbeddingRequest = {
content: {
parts: params.input.parts?.map((part) =>
part.type === "text"
? ({ text: part.text } satisfies GeminiTextPart)
: ({
inlineData: { mimeType: part.mimeType, data: part.data },
} satisfies GeminiInlinePart),
) ?? [{ text: params.input.text }],
},
taskType: params.taskType, taskType: params.taskType,
}; };
if (params.modelPath) { if (params.modelPath) {
@@ -143,7 +167,7 @@ function resolveRemoteApiKey(remoteApiKey: unknown): string | undefined {
return trimmed; return trimmed;
} }
function normalizeGeminiModel(model: string): string { export function normalizeGeminiModel(model: string): string {
const trimmed = model.trim(); const trimmed = model.trim();
if (!trimmed) { if (!trimmed) {
return DEFAULT_GEMINI_EMBEDDING_MODEL; return DEFAULT_GEMINI_EMBEDDING_MODEL;
@@ -158,6 +182,46 @@ function normalizeGeminiModel(model: string): string {
return withoutPrefix; return withoutPrefix;
} }
/**
 * Sends one embeddings POST to a Gemini endpoint and returns the parsed
 * JSON payload. Shared by the single-query and batch code paths, so the
 * result may carry either `embedding` (single request) or `embeddings`
 * (batch request).
 *
 * The call runs through executeWithApiKeyRotation with provider
 * "google" — presumably it retries with the next configured key on
 * key-related failures; confirm in that helper's implementation.
 */
async function fetchGeminiEmbeddingPayload(params: {
  client: GeminiEmbeddingClient;
  endpoint: string;
  body: unknown;
}): Promise<{
  embedding?: { values?: number[] };
  embeddings?: Array<{ values?: number[] }>;
}> {
  return await executeWithApiKeyRotation({
    provider: "google",
    apiKeys: params.client.apiKeys,
    execute: async (apiKey) => {
      // parseGeminiAuth turns the key into auth headers; client-level
      // headers are spread after them, so they take precedence on
      // conflicting header names.
      const authHeaders = parseGeminiAuth(apiKey);
      const headers = {
        ...authHeaders.headers,
        ...params.client.headers,
      };
      // withRemoteHttpResponse performs the request under the client's
      // SSRF policy and hands the raw Response to onResponse.
      return await withRemoteHttpResponse({
        url: params.endpoint,
        ssrfPolicy: params.client.ssrfPolicy,
        init: {
          method: "POST",
          headers,
          body: JSON.stringify(params.body),
        },
        onResponse: async (res) => {
          // Non-2xx: surface the status and response text for diagnosis.
          if (!res.ok) {
            const text = await res.text();
            throw new Error(`gemini embeddings failed: ${res.status} ${text}`);
          }
          return (await res.json()) as {
            embedding?: { values?: number[] };
            embeddings?: Array<{ values?: number[] }>;
          };
        },
      });
    },
  });
}
function normalizeGeminiBaseUrl(raw: string): string { function normalizeGeminiBaseUrl(raw: string): string {
const trimmed = raw.replace(/\/+$/, ""); const trimmed = raw.replace(/\/+$/, "");
const openAiIndex = trimmed.indexOf("/openai"); const openAiIndex = trimmed.indexOf("/openai");
@@ -181,71 +245,50 @@ export async function createGeminiEmbeddingProvider(
const isV2 = isGeminiEmbedding2Model(client.model); const isV2 = isGeminiEmbedding2Model(client.model);
const outputDimensionality = client.outputDimensionality; const outputDimensionality = client.outputDimensionality;
const fetchWithGeminiAuth = async (apiKey: string, endpoint: string, body: unknown) => {
const authHeaders = parseGeminiAuth(apiKey);
const headers = {
...authHeaders.headers,
...client.headers,
};
const payload = await withRemoteHttpResponse({
url: endpoint,
ssrfPolicy: client.ssrfPolicy,
init: {
method: "POST",
headers,
body: JSON.stringify(body),
},
onResponse: async (res) => {
if (!res.ok) {
const text = await res.text();
throw new Error(`gemini embeddings failed: ${res.status} ${text}`);
}
return (await res.json()) as {
embedding?: { values?: number[] };
embeddings?: Array<{ values?: number[] }>;
};
},
});
return payload;
};
const embedQuery = async (text: string): Promise<number[]> => { const embedQuery = async (text: string): Promise<number[]> => {
if (!text.trim()) { if (!text.trim()) {
return []; return [];
} }
const body = buildGeminiTextEmbeddingRequest({ const payload = await fetchGeminiEmbeddingPayload({
text, client,
taskType: options.taskType ?? "RETRIEVAL_QUERY", endpoint: embedUrl,
outputDimensionality: isV2 ? outputDimensionality : undefined, body: buildGeminiTextEmbeddingRequest({
}); text,
const payload = await executeWithApiKeyRotation({ taskType: options.taskType ?? "RETRIEVAL_QUERY",
provider: "google", outputDimensionality: isV2 ? outputDimensionality : undefined,
apiKeys: client.apiKeys, }),
execute: (apiKey) => fetchWithGeminiAuth(apiKey, embedUrl, body),
}); });
return sanitizeAndNormalizeEmbedding(payload.embedding?.values ?? []); return sanitizeAndNormalizeEmbedding(payload.embedding?.values ?? []);
}; };
const embedBatch = async (texts: string[]): Promise<number[][]> => { const embedBatchInputs = async (inputs: EmbeddingInput[]): Promise<number[][]> => {
if (texts.length === 0) { if (inputs.length === 0) {
return []; return [];
} }
const requests = texts.map((text) => const payload = await fetchGeminiEmbeddingPayload({
buildGeminiTextEmbeddingRequest({ client,
text, endpoint: batchUrl,
modelPath: client.modelPath, body: {
taskType: options.taskType ?? "RETRIEVAL_DOCUMENT", requests: inputs.map((input) =>
outputDimensionality: isV2 ? outputDimensionality : undefined, buildGeminiEmbeddingRequest({
}), input,
); modelPath: client.modelPath,
const batchBody = { requests }; taskType: options.taskType ?? "RETRIEVAL_DOCUMENT",
const payload = await executeWithApiKeyRotation({ outputDimensionality: isV2 ? outputDimensionality : undefined,
provider: "google", }),
apiKeys: client.apiKeys, ),
execute: (apiKey) => fetchWithGeminiAuth(apiKey, batchUrl, batchBody), },
}); });
const embeddings = Array.isArray(payload.embeddings) ? payload.embeddings : []; const embeddings = Array.isArray(payload.embeddings) ? payload.embeddings : [];
return texts.map((_, index) => sanitizeAndNormalizeEmbedding(embeddings[index]?.values ?? [])); return inputs.map((_, index) => sanitizeAndNormalizeEmbedding(embeddings[index]?.values ?? []));
};
const embedBatch = async (texts: string[]): Promise<number[][]> => {
return await embedBatchInputs(
texts.map((text) => ({
text,
})),
);
}; };
return { return {
@@ -255,6 +298,7 @@ export async function createGeminiEmbeddingProvider(
maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model], maxInputTokens: GEMINI_MAX_INPUT_TOKENS[client.model],
embedQuery, embedQuery,
embedBatch, embedBatch,
embedBatchInputs,
}, },
client, client,
}; };

View File

@@ -5,6 +5,7 @@ import type { SecretInput } from "../config/types.secrets.js";
import { formatErrorMessage } from "../infra/errors.js"; import { formatErrorMessage } from "../infra/errors.js";
import { resolveUserPath } from "../utils.js"; import { resolveUserPath } from "../utils.js";
import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js"; import { sanitizeAndNormalizeEmbedding } from "./embedding-vectors.js";
import type { EmbeddingInput } from "./embedding-inputs.js";
import { import {
createGeminiEmbeddingProvider, createGeminiEmbeddingProvider,
type GeminiEmbeddingClient, type GeminiEmbeddingClient,
@@ -31,6 +32,7 @@ export type EmbeddingProvider = {
maxInputTokens?: number; maxInputTokens?: number;
embedQuery: (text: string) => Promise<number[]>; embedQuery: (text: string) => Promise<number[]>;
embedBatch: (texts: string[]) => Promise<number[][]>; embedBatch: (texts: string[]) => Promise<number[][]>;
embedBatchInputs?: (inputs: EmbeddingInput[]) => Promise<number[][]>;
}; };
export type EmbeddingProviderId = "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama"; export type EmbeddingProviderId = "openai" | "local" | "gemini" | "voyage" | "mistral" | "ollama";

View File

@@ -1,3 +1,4 @@
import { randomUUID } from "node:crypto";
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import os from "node:os"; import os from "node:os";
import path from "node:path"; import path from "node:path";
@@ -6,6 +7,7 @@ import { getMemorySearchManager, type MemoryIndexManager } from "./index.js";
import "./test-runtime-mocks.js"; import "./test-runtime-mocks.js";
let embedBatchCalls = 0; let embedBatchCalls = 0;
let embedBatchInputCalls = 0;
let providerCalls: Array<{ provider?: string; model?: string; outputDimensionality?: number }> = []; let providerCalls: Array<{ provider?: string; model?: string; outputDimensionality?: number }> = [];
vi.mock("./embeddings.js", () => { vi.mock("./embeddings.js", () => {
@@ -13,7 +15,9 @@ vi.mock("./embeddings.js", () => {
const lower = text.toLowerCase(); const lower = text.toLowerCase();
const alpha = lower.split("alpha").length - 1; const alpha = lower.split("alpha").length - 1;
const beta = lower.split("beta").length - 1; const beta = lower.split("beta").length - 1;
return [alpha, beta]; const image = lower.split("image").length - 1;
const audio = lower.split("audio").length - 1;
return [alpha, beta, image, audio];
}; };
return { return {
createEmbeddingProvider: async (options: { createEmbeddingProvider: async (options: {
@@ -38,6 +42,32 @@ vi.mock("./embeddings.js", () => {
embedBatchCalls += 1; embedBatchCalls += 1;
return texts.map(embedText); return texts.map(embedText);
}, },
...(providerId === "gemini"
? {
embedBatchInputs: async (
inputs: Array<{
text: string;
parts?: Array<
{ type: "text"; text: string } | { type: "inline-data"; mimeType: string }
>;
}>,
) => {
embedBatchInputCalls += 1;
return inputs.map((input) => {
const mimeType = input.parts?.find(
(part) => part.type === "inline-data",
)?.mimeType;
if (mimeType?.startsWith("image/")) {
return [0, 0, 1, 0];
}
if (mimeType?.startsWith("audio/")) {
return [0, 0, 0, 1];
}
return embedText(input.text);
});
},
}
: {}),
}, },
...(providerId === "gemini" ...(providerId === "gemini"
? { ? {
@@ -64,6 +94,7 @@ describe("memory index", () => {
let indexVectorPath = ""; let indexVectorPath = "";
let indexMainPath = ""; let indexMainPath = "";
let indexExtraPath = ""; let indexExtraPath = "";
let indexMultimodalPath = "";
let indexStatusPath = ""; let indexStatusPath = "";
let indexSourceChangePath = ""; let indexSourceChangePath = "";
let indexModelPath = ""; let indexModelPath = "";
@@ -97,6 +128,7 @@ describe("memory index", () => {
indexMainPath = path.join(workspaceDir, "index-main.sqlite"); indexMainPath = path.join(workspaceDir, "index-main.sqlite");
indexVectorPath = path.join(workspaceDir, "index-vector.sqlite"); indexVectorPath = path.join(workspaceDir, "index-vector.sqlite");
indexExtraPath = path.join(workspaceDir, "index-extra.sqlite"); indexExtraPath = path.join(workspaceDir, "index-extra.sqlite");
indexMultimodalPath = path.join(workspaceDir, "index-multimodal.sqlite");
indexStatusPath = path.join(workspaceDir, "index-status.sqlite"); indexStatusPath = path.join(workspaceDir, "index-status.sqlite");
indexSourceChangePath = path.join(workspaceDir, "index-source-change.sqlite"); indexSourceChangePath = path.join(workspaceDir, "index-source-change.sqlite");
indexModelPath = path.join(workspaceDir, "index-model-change.sqlite"); indexModelPath = path.join(workspaceDir, "index-model-change.sqlite");
@@ -119,6 +151,7 @@ describe("memory index", () => {
// Keep atomic reindex tests on the safe path. // Keep atomic reindex tests on the safe path.
vi.stubEnv("OPENCLAW_TEST_MEMORY_UNSAFE_REINDEX", "1"); vi.stubEnv("OPENCLAW_TEST_MEMORY_UNSAFE_REINDEX", "1");
embedBatchCalls = 0; embedBatchCalls = 0;
embedBatchInputCalls = 0;
providerCalls = []; providerCalls = [];
// Keep the workspace stable to allow manager reuse across tests. // Keep the workspace stable to allow manager reuse across tests.
@@ -149,6 +182,11 @@ describe("memory index", () => {
provider?: "openai" | "gemini"; provider?: "openai" | "gemini";
model?: string; model?: string;
outputDimensionality?: number; outputDimensionality?: number;
multimodal?: {
enabled?: boolean;
modalities?: Array<"image" | "audio" | "all">;
maxFileBytes?: number;
};
vectorEnabled?: boolean; vectorEnabled?: boolean;
cacheEnabled?: boolean; cacheEnabled?: boolean;
minScore?: number; minScore?: number;
@@ -172,6 +210,7 @@ describe("memory index", () => {
}, },
cache: params.cacheEnabled ? { enabled: true } : undefined, cache: params.cacheEnabled ? { enabled: true } : undefined,
extraPaths: params.extraPaths, extraPaths: params.extraPaths,
multimodal: params.multimodal,
sources: params.sources, sources: params.sources,
experimental: { sessionMemory: params.sessionMemory ?? false }, experimental: { sessionMemory: params.sessionMemory ?? false },
}, },
@@ -247,6 +286,31 @@ describe("memory index", () => {
); );
}); });
it("indexes multimodal image and audio files from extra paths with Gemini structured inputs", async () => {
const mediaDir = path.join(workspaceDir, "media-memory");
await fs.mkdir(mediaDir, { recursive: true });
await fs.writeFile(path.join(mediaDir, "diagram.png"), Buffer.from("png"));
await fs.writeFile(path.join(mediaDir, "meeting.wav"), Buffer.from("wav"));
const cfg = createCfg({
storePath: indexMultimodalPath,
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
multimodal: { enabled: true, modalities: ["image", "audio"] },
});
const manager = await getPersistentManager(cfg);
await manager.sync({ reason: "test" });
expect(embedBatchInputCalls).toBeGreaterThan(0);
const imageResults = await manager.search("image");
expect(imageResults.some((result) => result.path.endsWith("diagram.png"))).toBe(true);
const audioResults = await manager.search("audio");
expect(audioResults.some((result) => result.path.endsWith("meeting.wav"))).toBe(true);
});
it("keeps dirty false in status-only manager after prior indexing", async () => { it("keeps dirty false in status-only manager after prior indexing", async () => {
const cfg = createCfg({ storePath: indexStatusPath }); const cfg = createCfg({ storePath: indexStatusPath });
@@ -433,6 +497,82 @@ describe("memory index", () => {
await secondManager.close?.(); await secondManager.close?.();
}); });
it("reindexes when extraPaths change", async () => {
const storePath = path.join(workspaceDir, `index-scope-extra-${randomUUID()}.sqlite`);
const firstExtraDir = path.join(workspaceDir, "scope-extra-a");
const secondExtraDir = path.join(workspaceDir, "scope-extra-b");
await fs.rm(firstExtraDir, { recursive: true, force: true });
await fs.rm(secondExtraDir, { recursive: true, force: true });
await fs.mkdir(firstExtraDir, { recursive: true });
await fs.mkdir(secondExtraDir, { recursive: true });
await fs.writeFile(path.join(firstExtraDir, "a.md"), "alpha only");
await fs.writeFile(path.join(secondExtraDir, "b.md"), "beta only");
const first = await getMemorySearchManager({
cfg: createCfg({
storePath,
extraPaths: [firstExtraDir],
}),
agentId: "main",
});
const firstManager = requireManager(first);
await firstManager.sync?.({ reason: "test" });
await firstManager.close?.();
const second = await getMemorySearchManager({
cfg: createCfg({
storePath,
extraPaths: [secondExtraDir],
}),
agentId: "main",
});
const secondManager = requireManager(second);
await secondManager.sync?.({ reason: "test" });
const results = await secondManager.search("beta");
expect(results.some((result) => result.path.endsWith("scope-extra-b/b.md"))).toBe(true);
expect(results.some((result) => result.path.endsWith("scope-extra-a/a.md"))).toBe(false);
await secondManager.close?.();
});
it("reindexes when multimodal settings change", async () => {
const storePath = path.join(workspaceDir, `index-scope-multimodal-${randomUUID()}.sqlite`);
const mediaDir = path.join(workspaceDir, "scope-media");
await fs.rm(mediaDir, { recursive: true, force: true });
await fs.mkdir(mediaDir, { recursive: true });
await fs.writeFile(path.join(mediaDir, "diagram.png"), Buffer.from("png"));
const first = await getMemorySearchManager({
cfg: createCfg({
storePath,
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
}),
agentId: "main",
});
const firstManager = requireManager(first);
await firstManager.sync?.({ reason: "test" });
const multimodalCallsAfterFirstSync = embedBatchInputCalls;
await firstManager.close?.();
const second = await getMemorySearchManager({
cfg: createCfg({
storePath,
provider: "gemini",
model: "gemini-embedding-2-preview",
extraPaths: [mediaDir],
multimodal: { enabled: true, modalities: ["image"] },
}),
agentId: "main",
});
const secondManager = requireManager(second);
await secondManager.sync?.({ reason: "test" });
expect(embedBatchInputCalls).toBeGreaterThan(multimodalCallsAfterFirstSync);
const results = await secondManager.search("image");
expect(results.some((result) => result.path.endsWith("scope-media/diagram.png"))).toBe(true);
await secondManager.close?.();
});
it("reuses cached embeddings on forced reindex", async () => { it("reuses cached embeddings on forced reindex", async () => {
const cfg = createCfg({ storePath: indexMainPath, cacheEnabled: true }); const cfg = createCfg({ storePath: indexMainPath, cacheEnabled: true });
const manager = await getPersistentManager(cfg); const manager = await getPersistentManager(cfg);

View File

@@ -9,6 +9,10 @@ import {
normalizeExtraMemoryPaths, normalizeExtraMemoryPaths,
remapChunkLines, remapChunkLines,
} from "./internal.js"; } from "./internal.js";
import {
DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES,
type MemoryMultimodalSettings,
} from "./multimodal.js";
function setupTempDirLifecycle(prefix: string): () => string { function setupTempDirLifecycle(prefix: string): () => string {
let tmpDir = ""; let tmpDir = "";
@@ -38,6 +42,11 @@ describe("normalizeExtraMemoryPaths", () => {
describe("listMemoryFiles", () => { describe("listMemoryFiles", () => {
const getTmpDir = setupTempDirLifecycle("memory-test-"); const getTmpDir = setupTempDirLifecycle("memory-test-");
const multimodal: MemoryMultimodalSettings = {
enabled: true,
modalities: ["image", "audio"],
maxFileBytes: DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES,
};
it("includes files from additional paths (directory)", async () => { it("includes files from additional paths (directory)", async () => {
const tmpDir = getTmpDir(); const tmpDir = getTmpDir();
@@ -131,10 +140,29 @@ describe("listMemoryFiles", () => {
const memoryMatches = files.filter((file) => file.endsWith("MEMORY.md")); const memoryMatches = files.filter((file) => file.endsWith("MEMORY.md"));
expect(memoryMatches).toHaveLength(1); expect(memoryMatches).toHaveLength(1);
}); });
it("includes image and audio files from extra paths when multimodal is enabled", async () => {
const tmpDir = getTmpDir();
const extraDir = path.join(tmpDir, "media");
await fs.mkdir(extraDir, { recursive: true });
await fs.writeFile(path.join(extraDir, "diagram.png"), Buffer.from("png"));
await fs.writeFile(path.join(extraDir, "note.wav"), Buffer.from("wav"));
await fs.writeFile(path.join(extraDir, "ignore.bin"), Buffer.from("bin"));
const files = await listMemoryFiles(tmpDir, [extraDir], multimodal);
expect(files.some((file) => file.endsWith("diagram.png"))).toBe(true);
expect(files.some((file) => file.endsWith("note.wav"))).toBe(true);
expect(files.some((file) => file.endsWith("ignore.bin"))).toBe(false);
});
}); });
describe("buildFileEntry", () => { describe("buildFileEntry", () => {
const getTmpDir = setupTempDirLifecycle("memory-build-entry-"); const getTmpDir = setupTempDirLifecycle("memory-build-entry-");
const multimodal: MemoryMultimodalSettings = {
enabled: true,
modalities: ["image", "audio"],
maxFileBytes: DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES,
};
it("returns null when the file disappears before reading", async () => { it("returns null when the file disappears before reading", async () => {
const tmpDir = getTmpDir(); const tmpDir = getTmpDir();
@@ -154,6 +182,26 @@ describe("buildFileEntry", () => {
expect(entry?.path).toBe("note.md"); expect(entry?.path).toBe("note.md");
expect(entry?.size).toBeGreaterThan(0); expect(entry?.size).toBeGreaterThan(0);
}); });
it("returns multimodal metadata for eligible image files", async () => {
const tmpDir = getTmpDir();
const target = path.join(tmpDir, "diagram.png");
await fs.writeFile(target, Buffer.from("png"));
const entry = await buildFileEntry(target, tmpDir, multimodal);
expect(entry).toMatchObject({
path: "diagram.png",
kind: "multimodal",
modality: "image",
mimeType: "image/png",
contentText: "Image file: diagram.png",
});
expect(entry?.embeddingInput?.parts).toEqual([
{ type: "text", text: "Image file: diagram.png" },
expect.objectContaining({ type: "inline-data", mimeType: "image/png" }),
]);
});
}); });
describe("chunkMarkdown", () => { describe("chunkMarkdown", () => {

View File

@@ -2,8 +2,16 @@ import crypto from "node:crypto";
import fsSync from "node:fs"; import fsSync from "node:fs";
import fs from "node:fs/promises"; import fs from "node:fs/promises";
import path from "node:path"; import path from "node:path";
import { detectMime } from "../media/mime.js";
import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js"; import { runTasksWithConcurrency } from "../utils/run-with-concurrency.js";
import { buildTextEmbeddingInput, type EmbeddingInput } from "./embedding-inputs.js";
import { isFileMissingError } from "./fs-utils.js"; import { isFileMissingError } from "./fs-utils.js";
import {
classifyMemoryMultimodalPath,
isMemoryMultimodalEnabled,
type MemoryMultimodalModality,
type MemoryMultimodalSettings,
} from "./multimodal.js";
export type MemoryFileEntry = { export type MemoryFileEntry = {
path: string; path: string;
@@ -11,6 +19,11 @@ export type MemoryFileEntry = {
mtimeMs: number; mtimeMs: number;
size: number; size: number;
hash: string; hash: string;
kind?: "markdown" | "multimodal";
contentText?: string;
embeddingInput?: EmbeddingInput;
modality?: MemoryMultimodalModality;
mimeType?: string;
}; };
export type MemoryChunk = { export type MemoryChunk = {
@@ -18,6 +31,13 @@ export type MemoryChunk = {
endLine: number; endLine: number;
text: string; text: string;
hash: string; hash: string;
embeddingInput?: EmbeddingInput;
};
const DISABLED_MULTIMODAL_SETTINGS: MemoryMultimodalSettings = {
enabled: false,
modalities: [],
maxFileBytes: 0,
}; };
export function ensureDir(dir: string): string { export function ensureDir(dir: string): string {
@@ -56,7 +76,16 @@ export function isMemoryPath(relPath: string): boolean {
return normalized.startsWith("memory/"); return normalized.startsWith("memory/");
} }
async function walkDir(dir: string, files: string[]) { function isAllowedMemoryFilePath(filePath: string, multimodal?: MemoryMultimodalSettings): boolean {
if (filePath.endsWith(".md")) {
return true;
}
return (
classifyMemoryMultimodalPath(filePath, multimodal ?? DISABLED_MULTIMODAL_SETTINGS) !== null
);
}
async function walkDir(dir: string, files: string[], multimodal?: MemoryMultimodalSettings) {
const entries = await fs.readdir(dir, { withFileTypes: true }); const entries = await fs.readdir(dir, { withFileTypes: true });
for (const entry of entries) { for (const entry of entries) {
const full = path.join(dir, entry.name); const full = path.join(dir, entry.name);
@@ -64,13 +93,13 @@ async function walkDir(dir: string, files: string[]) {
continue; continue;
} }
if (entry.isDirectory()) { if (entry.isDirectory()) {
await walkDir(full, files); await walkDir(full, files, multimodal);
continue; continue;
} }
if (!entry.isFile()) { if (!entry.isFile()) {
continue; continue;
} }
if (!entry.name.endsWith(".md")) { if (!isAllowedMemoryFilePath(full, multimodal)) {
continue; continue;
} }
files.push(full); files.push(full);
@@ -80,6 +109,7 @@ async function walkDir(dir: string, files: string[]) {
export async function listMemoryFiles( export async function listMemoryFiles(
workspaceDir: string, workspaceDir: string,
extraPaths?: string[], extraPaths?: string[],
multimodal?: MemoryMultimodalSettings,
): Promise<string[]> { ): Promise<string[]> {
const result: string[] = []; const result: string[] = [];
const memoryFile = path.join(workspaceDir, "MEMORY.md"); const memoryFile = path.join(workspaceDir, "MEMORY.md");
@@ -117,10 +147,10 @@ export async function listMemoryFiles(
continue; continue;
} }
if (stat.isDirectory()) { if (stat.isDirectory()) {
await walkDir(inputPath, result); await walkDir(inputPath, result, multimodal);
continue; continue;
} }
if (stat.isFile() && inputPath.endsWith(".md")) { if (stat.isFile() && isAllowedMemoryFilePath(inputPath, multimodal)) {
result.push(inputPath); result.push(inputPath);
} }
} catch {} } catch {}
@@ -152,6 +182,7 @@ export function hashText(value: string): string {
export async function buildFileEntry( export async function buildFileEntry(
absPath: string, absPath: string,
workspaceDir: string, workspaceDir: string,
multimodal?: MemoryMultimodalSettings,
): Promise<MemoryFileEntry | null> { ): Promise<MemoryFileEntry | null> {
let stat; let stat;
try { try {
@@ -162,6 +193,63 @@ export async function buildFileEntry(
} }
throw err; throw err;
} }
const normalizedPath = path.relative(workspaceDir, absPath).replace(/\\/g, "/");
const multimodalSettings = multimodal ?? DISABLED_MULTIMODAL_SETTINGS;
const modality = classifyMemoryMultimodalPath(absPath, multimodalSettings);
if (modality) {
if (!isMemoryMultimodalEnabled(multimodalSettings)) {
return null;
}
if (stat.size > multimodalSettings.maxFileBytes) {
return null;
}
let buffer: Buffer;
try {
buffer = await fs.readFile(absPath);
} catch (err) {
if (isFileMissingError(err)) {
return null;
}
throw err;
}
const mimeType = await detectMime({ buffer: buffer.subarray(0, 512), filePath: absPath });
if (!mimeType || !mimeType.startsWith(`${modality}/`)) {
return null;
}
const contentText = `${modality === "image" ? "Image" : "Audio"} file: ${normalizedPath}`;
const embeddingInput: EmbeddingInput = {
text: contentText,
parts: [
{ type: "text", text: contentText },
{
type: "inline-data",
mimeType,
data: buffer.toString("base64"),
},
],
};
const dataHash = crypto.createHash("sha256").update(buffer).digest("hex");
const chunkHash = hashText(
JSON.stringify({
path: normalizedPath,
contentText,
mimeType,
dataHash,
}),
);
return {
path: normalizedPath,
absPath,
mtimeMs: stat.mtimeMs,
size: stat.size,
hash: chunkHash,
kind: "multimodal",
contentText,
embeddingInput,
modality,
mimeType,
};
}
let content: string; let content: string;
try { try {
content = await fs.readFile(absPath, "utf-8"); content = await fs.readFile(absPath, "utf-8");
@@ -173,11 +261,12 @@ export async function buildFileEntry(
} }
const hash = hashText(content); const hash = hashText(content);
return { return {
path: path.relative(workspaceDir, absPath).replace(/\\/g, "/"), path: normalizedPath,
absPath, absPath,
mtimeMs: stat.mtimeMs, mtimeMs: stat.mtimeMs,
size: stat.size, size: stat.size,
hash, hash,
kind: "markdown",
}; };
} }
@@ -213,6 +302,7 @@ export function chunkMarkdown(
endLine, endLine,
text, text,
hash: hashText(text), hash: hashText(text),
embeddingInput: buildTextEmbeddingInput(text),
}); });
}; };

View File

@@ -8,8 +8,12 @@ import {
} from "./batch-openai.js"; } from "./batch-openai.js";
import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js"; import { type VoyageBatchRequest, runVoyageEmbeddingBatches } from "./batch-voyage.js";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js"; import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js"; import {
import { buildGeminiTextEmbeddingRequest } from "./embeddings-gemini.js"; estimateStructuredEmbeddingInputBytes,
estimateUtf8Bytes,
} from "./embedding-input-limits.js";
import { type EmbeddingInput, hasNonTextEmbeddingParts } from "./embedding-inputs.js";
import { buildGeminiEmbeddingRequest } from "./embeddings-gemini.js";
import { import {
chunkMarkdown, chunkMarkdown,
hashText, hashText,
@@ -53,7 +57,9 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
let currentTokens = 0; let currentTokens = 0;
for (const chunk of chunks) { for (const chunk of chunks) {
const estimate = estimateUtf8Bytes(chunk.text); const estimate = chunk.embeddingInput
? estimateStructuredEmbeddingInputBytes(chunk.embeddingInput)
: estimateUtf8Bytes(chunk.text);
const wouldExceed = const wouldExceed =
current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS; current.length > 0 && currentTokens + estimate > EMBEDDING_BATCH_MAX_TOKENS;
if (wouldExceed) { if (wouldExceed) {
@@ -188,9 +194,22 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
const missingChunks = missing.map((m) => m.chunk); const missingChunks = missing.map((m) => m.chunk);
const batches = this.buildEmbeddingBatches(missingChunks); const batches = this.buildEmbeddingBatches(missingChunks);
const toCache: Array<{ hash: string; embedding: number[] }> = []; const toCache: Array<{ hash: string; embedding: number[] }> = [];
const provider = this.provider;
if (!provider) {
throw new Error("Cannot embed batch in FTS-only mode (no embedding provider)");
}
let cursor = 0; let cursor = 0;
for (const batch of batches) { for (const batch of batches) {
const batchEmbeddings = await this.embedBatchWithRetry(batch.map((chunk) => chunk.text)); const inputs = batch.map((chunk) => chunk.embeddingInput ?? { text: chunk.text });
const hasStructuredInputs = inputs.some((input) => hasNonTextEmbeddingParts(input));
if (hasStructuredInputs && !provider.embedBatchInputs) {
throw new Error(
`Embedding provider "${provider.id}" does not support multimodal memory inputs.`,
);
}
const batchEmbeddings = hasStructuredInputs
? await this.embedBatchInputsWithRetry(inputs)
: await this.embedBatchWithRetry(batch.map((chunk) => chunk.text));
for (let i = 0; i < batch.length; i += 1) { for (let i = 0; i < batch.length; i += 1) {
const item = missing[cursor + i]; const item = missing[cursor + i];
const embedding = batchEmbeddings[i] ?? []; const embedding = batchEmbeddings[i] ?? [];
@@ -476,6 +495,9 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
source: MemorySource, source: MemorySource,
): Promise<number[][]> { ): Promise<number[][]> {
const gemini = this.gemini; const gemini = this.gemini;
if (chunks.some((chunk) => hasNonTextEmbeddingParts(chunk.embeddingInput))) {
return await this.embedChunksInBatches(chunks);
}
return await this.embedChunksWithProviderBatch<GeminiBatchRequest>({ return await this.embedChunksWithProviderBatch<GeminiBatchRequest>({
chunks, chunks,
entry, entry,
@@ -483,9 +505,10 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
provider: "gemini", provider: "gemini",
enabled: Boolean(gemini), enabled: Boolean(gemini),
buildRequest: (chunk) => ({ buildRequest: (chunk) => ({
request: buildGeminiTextEmbeddingRequest({ request: buildGeminiEmbeddingRequest({
text: chunk.text, input: chunk.embeddingInput ?? { text: chunk.text },
taskType: "RETRIEVAL_DOCUMENT", taskType: "RETRIEVAL_DOCUMENT",
modelPath: this.gemini?.modelPath,
outputDimensionality: this.gemini?.outputDimensionality, outputDimensionality: this.gemini?.outputDimensionality,
}), }),
}), }),
@@ -536,6 +559,45 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
} }
} }
/**
 * Embeds structured (possibly multi-part) inputs with the same retry/backoff
 * policy as the plain-text batch path. Falls back to text-only embedding when
 * the active provider does not implement `embedBatchInputs`.
 *
 * Retries only errors classified as retryable (rate limits / 5xx per
 * isRetryableEmbeddingError), with exponential backoff and jitter, up to
 * EMBEDDING_RETRY_MAX_ATTEMPTS; other errors are rethrown immediately.
 */
protected async embedBatchInputsWithRetry(inputs: EmbeddingInput[]): Promise<number[][]> {
  if (inputs.length === 0) {
    return [];
  }
  // Provider without structured-input support: degrade to text-only embedding.
  if (!this.provider?.embedBatchInputs) {
    return await this.embedBatchWithRetry(inputs.map((input) => input.text));
  }
  let attempt = 0;
  let delayMs = EMBEDDING_RETRY_BASE_DELAY_MS;
  while (true) {
    try {
      const timeoutMs = this.resolveEmbeddingTimeout("batch");
      log.debug("memory embeddings: structured batch start", {
        provider: this.provider.id,
        items: inputs.length,
        timeoutMs,
      });
      return await this.withTimeout(
        this.provider.embedBatchInputs(inputs),
        timeoutMs,
        `memory embeddings batch timed out after ${Math.round(timeoutMs / 1000)}s`,
      );
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      // Give up on non-retryable failures or once attempts are exhausted.
      if (!this.isRetryableEmbeddingError(message) || attempt >= EMBEDDING_RETRY_MAX_ATTEMPTS) {
        throw err;
      }
      // Exponential backoff with up to 20% jitter, capped at the max delay.
      const waitMs = Math.min(
        EMBEDDING_RETRY_MAX_DELAY_MS,
        Math.round(delayMs * (1 + Math.random() * 0.2)),
      );
      log.warn(`memory embeddings rate limited; retrying structured batch in ${waitMs}ms`);
      await new Promise((resolve) => setTimeout(resolve, waitMs));
      delayMs *= 2;
      attempt += 1;
    }
  }
}
private isRetryableEmbeddingError(message: string): boolean { private isRetryableEmbeddingError(message: string): boolean {
return /(rate[_ ]limit|too many requests|429|resource has been exhausted|5\d\d|cloudflare|tokens per day)/i.test( return /(rate[_ ]limit|too many requests|429|resource has been exhausted|5\d\d|cloudflare|tokens per day)/i.test(
message, message,
@@ -708,16 +770,29 @@ export abstract class MemoryManagerEmbeddingOps extends MemoryManagerSyncOps {
return; return;
} }
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8")); let chunks: MemoryChunk[];
const chunks = enforceEmbeddingMaxInputTokens( if ("kind" in entry && entry.kind === "multimodal" && entry.embeddingInput) {
this.provider, chunks = [
chunkMarkdown(content, this.settings.chunking).filter( {
(chunk) => chunk.text.trim().length > 0, startLine: 1,
), endLine: 1,
EMBEDDING_BATCH_MAX_TOKENS, text: entry.contentText ?? entry.embeddingInput.text,
); hash: entry.hash,
if (options.source === "sessions" && "lineMap" in entry) { embeddingInput: entry.embeddingInput,
remapChunkLines(chunks, entry.lineMap); },
];
} else {
const content = options.content ?? (await fs.readFile(entry.absPath, "utf-8"));
chunks = enforceEmbeddingMaxInputTokens(
this.provider,
chunkMarkdown(content, this.settings.chunking).filter(
(chunk) => chunk.text.trim().length > 0,
),
EMBEDDING_BATCH_MAX_TOKENS,
);
if (options.source === "sessions" && "lineMap" in entry) {
remapChunkLines(chunks, entry.lineMap);
}
} }
const embeddings = this.batch.enabled const embeddings = this.batch.enabled
? await this.embedChunksWithBatch(chunks, entry, options.source) ? await this.embedChunksWithBatch(chunks, entry, options.source)

View File

@@ -29,12 +29,14 @@ import { isFileMissingError } from "./fs-utils.js";
import { import {
buildFileEntry, buildFileEntry,
ensureDir, ensureDir,
hashText,
listMemoryFiles, listMemoryFiles,
normalizeExtraMemoryPaths, normalizeExtraMemoryPaths,
runWithConcurrency, runWithConcurrency,
} from "./internal.js"; } from "./internal.js";
import { type MemoryFileEntry } from "./internal.js"; import { type MemoryFileEntry } from "./internal.js";
import { ensureMemoryIndexSchema } from "./memory-schema.js"; import { ensureMemoryIndexSchema } from "./memory-schema.js";
import { classifyMemoryMultimodalPath } from "./multimodal.js";
import type { SessionFileEntry } from "./session-files.js"; import type { SessionFileEntry } from "./session-files.js";
import { import {
buildSessionEntry, buildSessionEntry,
@@ -50,6 +52,7 @@ type MemoryIndexMeta = {
provider: string; provider: string;
providerKey?: string; providerKey?: string;
sources?: MemorySource[]; sources?: MemorySource[];
scopeHash?: string;
chunkTokens: number; chunkTokens: number;
chunkOverlap: number; chunkOverlap: number;
vectorDims?: number; vectorDims?: number;
@@ -383,9 +386,22 @@ export abstract class MemoryManagerSyncOps {
} }
if (stat.isDirectory()) { if (stat.isDirectory()) {
watchPaths.add(path.join(entry, "**", "*.md")); watchPaths.add(path.join(entry, "**", "*.md"));
if (this.settings.multimodal.enabled) {
for (const modality of this.settings.multimodal.modalities) {
const pattern =
modality === "image"
? "*.{jpg,jpeg,png,webp,gif,heic,heif}"
: "*.{mp3,wav,ogg,opus,m4a,aac,flac}";
watchPaths.add(path.join(entry, "**", pattern));
}
}
continue; continue;
} }
if (stat.isFile() && entry.toLowerCase().endsWith(".md")) { if (
stat.isFile() &&
(entry.toLowerCase().endsWith(".md") ||
classifyMemoryMultimodalPath(entry, this.settings.multimodal) !== null)
) {
watchPaths.add(entry); watchPaths.add(entry);
} }
} catch { } catch {
@@ -649,9 +665,17 @@ export abstract class MemoryManagerSyncOps {
return; return;
} }
const files = await listMemoryFiles(this.workspaceDir, this.settings.extraPaths); const files = await listMemoryFiles(
this.workspaceDir,
this.settings.extraPaths,
this.settings.multimodal,
);
const fileEntries = ( const fileEntries = (
await Promise.all(files.map(async (file) => buildFileEntry(file, this.workspaceDir))) await Promise.all(
files.map(async (file) =>
buildFileEntry(file, this.workspaceDir, this.settings.multimodal),
),
)
).filter((entry): entry is MemoryFileEntry => entry !== null); ).filter((entry): entry is MemoryFileEntry => entry !== null);
log.debug("memory sync: indexing memory files", { log.debug("memory sync: indexing memory files", {
files: fileEntries.length, files: fileEntries.length,
@@ -868,6 +892,7 @@ export abstract class MemoryManagerSyncOps {
const vectorReady = await this.ensureVectorReady(); const vectorReady = await this.ensureVectorReady();
const meta = this.readMeta(); const meta = this.readMeta();
const configuredSources = this.resolveConfiguredSourcesForMeta(); const configuredSources = this.resolveConfiguredSourcesForMeta();
const configuredScopeHash = this.resolveConfiguredScopeHash();
const needsFullReindex = const needsFullReindex =
params?.force || params?.force ||
!meta || !meta ||
@@ -875,6 +900,7 @@ export abstract class MemoryManagerSyncOps {
(this.provider && meta.provider !== this.provider.id) || (this.provider && meta.provider !== this.provider.id) ||
meta.providerKey !== this.providerKey || meta.providerKey !== this.providerKey ||
this.metaSourcesDiffer(meta, configuredSources) || this.metaSourcesDiffer(meta, configuredSources) ||
meta.scopeHash !== configuredScopeHash ||
meta.chunkTokens !== this.settings.chunking.tokens || meta.chunkTokens !== this.settings.chunking.tokens ||
meta.chunkOverlap !== this.settings.chunking.overlap || meta.chunkOverlap !== this.settings.chunking.overlap ||
(vectorReady && !meta?.vectorDims); (vectorReady && !meta?.vectorDims);
@@ -1088,6 +1114,7 @@ export abstract class MemoryManagerSyncOps {
provider: this.provider?.id ?? "none", provider: this.provider?.id ?? "none",
providerKey: this.providerKey!, providerKey: this.providerKey!,
sources: this.resolveConfiguredSourcesForMeta(), sources: this.resolveConfiguredSourcesForMeta(),
scopeHash: this.resolveConfiguredScopeHash(),
chunkTokens: this.settings.chunking.tokens, chunkTokens: this.settings.chunking.tokens,
chunkOverlap: this.settings.chunking.overlap, chunkOverlap: this.settings.chunking.overlap,
}; };
@@ -1159,6 +1186,7 @@ export abstract class MemoryManagerSyncOps {
provider: this.provider?.id ?? "none", provider: this.provider?.id ?? "none",
providerKey: this.providerKey!, providerKey: this.providerKey!,
sources: this.resolveConfiguredSourcesForMeta(), sources: this.resolveConfiguredSourcesForMeta(),
scopeHash: this.resolveConfiguredScopeHash(),
chunkTokens: this.settings.chunking.tokens, chunkTokens: this.settings.chunking.tokens,
chunkOverlap: this.settings.chunking.overlap, chunkOverlap: this.settings.chunking.overlap,
}; };
@@ -1236,6 +1264,22 @@ export abstract class MemoryManagerSyncOps {
return normalized.length > 0 ? normalized : ["memory"]; return normalized.length > 0 ? normalized : ["memory"];
} }
/**
 * Stable hash of the index "scope": normalized extraPaths plus the multimodal
 * settings. The value is persisted in index meta; any mismatch against the
 * currently configured value forces a full reindex. Paths and modalities are
 * sorted so ordering in the user's config does not change the hash.
 *
 * NOTE(review): the JSON key order below is part of the hash contract —
 * changing it would invalidate every existing index.
 */
private resolveConfiguredScopeHash(): string {
  const extraPaths = normalizeExtraMemoryPaths(this.workspaceDir, this.settings.extraPaths)
    .map((value) => value.replace(/\\/g, "/"))
    .toSorted();
  return hashText(
    JSON.stringify({
      extraPaths,
      multimodal: {
        enabled: this.settings.multimodal.enabled,
        modalities: [...this.settings.multimodal.modalities].toSorted(),
        maxFileBytes: this.settings.multimodal.maxFileBytes,
      },
    }),
  );
}
private metaSourcesDiffer(meta: MemoryIndexMeta, configuredSources: MemorySource[]): boolean { private metaSourcesDiffer(meta: MemoryIndexMeta, configuredSources: MemorySource[]): boolean {
const metaSources = this.normalizeMetaSources(meta); const metaSources = this.normalizeMetaSources(meta);
if (metaSources.length !== configuredSources.length) { if (metaSources.length !== configuredSources.length) {

88
src/memory/multimodal.ts Normal file
View File

@@ -0,0 +1,88 @@
/** Modalities supported by multimodal memory indexing in this phase. */
export const MEMORY_MULTIMODAL_MODALITIES = ["image", "audio"] as const;

export type MemoryMultimodalModality = (typeof MEMORY_MULTIMODAL_MODALITIES)[number];

/** What users may write in config: a concrete modality or the "all" shorthand. */
export type MemoryMultimodalSelection = MemoryMultimodalModality | "all";

/** Fully-resolved multimodal settings used throughout the memory index. */
export type MemoryMultimodalSettings = {
  enabled: boolean;
  modalities: MemoryMultimodalModality[];
  maxFileBytes: number;
};

/** Default per-file size cap for binary memory files (10 MiB). */
export const DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES = 10 * 1024 * 1024;

const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".gif", ".heic", ".heif"]);
const AUDIO_EXTENSIONS = new Set([".mp3", ".wav", ".ogg", ".opus", ".m4a", ".aac", ".flac"]);

/**
 * Resolve a raw modality selection into a concrete, de-duplicated list.
 * Absent/empty input and the "all" shorthand both expand to every supported
 * modality; unknown entries are dropped; first-occurrence order is kept.
 */
export function normalizeMemoryMultimodalModalities(
  raw: MemoryMultimodalSelection[] | undefined,
): MemoryMultimodalModality[] {
  if (!raw?.length || raw.includes("all")) {
    return [...MEMORY_MULTIMODAL_MODALITIES];
  }
  const picked = raw.filter(
    (value): value is MemoryMultimodalModality => value === "image" || value === "audio",
  );
  return [...new Set(picked)];
}
/**
 * Coerce raw (user-supplied) multimodal config into fully-resolved settings.
 * Multimodal indexing is opt-in (`enabled` must be exactly `true`); when it is
 * off, the modality list is forced empty. A non-finite or missing size cap
 * falls back to the default; explicit caps are floored and clamped to >= 1.
 */
export function normalizeMemoryMultimodalSettings(raw: {
  enabled?: boolean;
  modalities?: MemoryMultimodalSelection[];
  maxFileBytes?: number;
}): MemoryMultimodalSettings {
  const enabled = raw.enabled === true;
  let maxFileBytes = DEFAULT_MEMORY_MULTIMODAL_MAX_FILE_BYTES;
  if (typeof raw.maxFileBytes === "number" && Number.isFinite(raw.maxFileBytes)) {
    maxFileBytes = Math.max(1, Math.floor(raw.maxFileBytes));
  }
  const modalities = enabled ? normalizeMemoryMultimodalModalities(raw.modalities) : [];
  return { enabled, modalities, maxFileBytes };
}
/** True only when multimodal indexing is switched on AND has at least one modality. */
export function isMemoryMultimodalEnabled(settings: MemoryMultimodalSettings): boolean {
  if (!settings.enabled) {
    return false;
  }
  return settings.modalities.length !== 0;
}
/**
 * Map a file path to the modality its extension belongs to, honoring the
 * enabled modality list. Returns null when multimodal indexing is effectively
 * disabled or no enabled modality matches. Matching is case-insensitive and
 * ignores surrounding whitespace in the path.
 */
export function classifyMemoryMultimodalPath(
  filePath: string,
  settings: MemoryMultimodalSettings,
): MemoryMultimodalModality | null {
  if (!isMemoryMultimodalEnabled(settings)) {
    return null;
  }
  const candidate = filePath.trim().toLowerCase();
  const matched = settings.modalities.find((modality) => {
    const extensions = modality === "image" ? IMAGE_EXTENSIONS : AUDIO_EXTENSIONS;
    return [...extensions].some((extension) => candidate.endsWith(extension));
  });
  return matched ?? null;
}
/**
 * Normalize a Gemini embedding model id by stripping the "models/" resource
 * prefix and then a single "gemini/" or "google/" provider prefix, so config
 * variants compare equal. Blank/whitespace-only input yields "".
 */
export function normalizeGeminiEmbeddingModelForMemory(model: string): string {
  const name = model.trim();
  if (name.length === 0) {
    return "";
  }
  const withoutResourcePrefix = name.replace(/^models\//, "");
  return withoutResourcePrefix.replace(/^(?:gemini|google)\//, "");
}
/**
 * Whether the given provider/model pair can embed image/audio memory files.
 * Currently restricted to Gemini's "gemini-embedding-2-preview" (after prefix
 * normalization); every other provider or model is text-only.
 */
export function supportsMemoryMultimodalEmbeddings(params: {
  provider: string;
  model: string;
}): boolean {
  if (params.provider !== "gemini") {
    return false;
  }
  const normalizedModel = normalizeGeminiEmbeddingModelForMemory(params.model);
  return normalizedModel === "gemini-embedding-2-preview";
}