feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
This commit is contained in:
Peter Steinberger
2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions

View File

@@ -0,0 +1,258 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import { fetchRemoteMedia } from "../media/fetch.js";
vi.mock("../agents/model-auth.js", () => ({
resolveApiKeyForProvider: vi.fn(async () => ({
apiKey: "test-key",
source: "test",
})),
}));
vi.mock("../media/fetch.js", () => ({
fetchRemoteMedia: vi.fn(),
}));
vi.mock("../process/exec.js", () => ({
runExec: vi.fn(),
}));
async function loadApply() {
return await import("./apply.js");
}
describe("applyMediaUnderstanding", () => {
const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);
beforeEach(() => {
mockedResolveApiKey.mockClear();
mockedFetchRemoteMedia.mockReset();
mockedFetchRemoteMedia.mockResolvedValue({
buffer: Buffer.from("audio-bytes"),
contentType: "audio/ogg",
fileName: "note.ogg",
});
});
it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "note.ogg");
await fs.writeFile(audioPath, "hello");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/ogg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "transcribed text" }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("transcribed text");
expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
expect(ctx.CommandBody).toBe("transcribed text");
expect(ctx.RawBody).toBe("transcribed text");
});
it("handles URL-only attachments for audio transcription", async () => {
const { applyMediaUnderstanding } = await loadApply();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaUrl: "https://example.com/note.ogg",
MediaType: "audio/ogg",
ChatType: "dm",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
scope: {
default: "deny",
rules: [{ action: "allow", match: { chatType: "direct" } }],
},
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => ({ text: "remote transcript" }),
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("remote transcript");
expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
});
it("skips audio transcription when attachment exceeds maxBytes", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "large.wav");
await fs.writeFile(audioPath, "0123456789");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/wav",
};
const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 4,
models: [{ provider: "groq" }],
},
},
},
};
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: { groq: { id: "groq", transcribeAudio } },
});
expect(result.appliedAudio).toBe(false);
expect(transcribeAudio).not.toHaveBeenCalled();
expect(ctx.Body).toBe("<media:audio>");
});
it("falls back to CLI model when provider fails", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const audioPath = path.join(dir, "note.ogg");
await fs.writeFile(audioPath, "hello");
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: audioPath,
MediaType: "audio/ogg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
audio: {
enabled: true,
models: [
{ provider: "groq" },
{
type: "cli",
command: "whisper",
args: ["{{MediaPath}}"],
},
],
},
},
},
};
const execModule = await import("../process/exec.js");
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "cli transcript\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
providers: {
groq: {
id: "groq",
transcribeAudio: async () => {
throw new Error("boom");
},
},
},
});
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("cli transcript");
expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
});
it("uses CLI image understanding and preserves caption for commands", async () => {
const { applyMediaUnderstanding } = await loadApply();
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
const imagePath = path.join(dir, "photo.jpg");
await fs.writeFile(imagePath, "image-bytes");
const ctx: MsgContext = {
Body: "<media:image> show Dom",
MediaPath: imagePath,
MediaType: "image/jpeg",
};
const cfg: ClawdbotConfig = {
tools: {
media: {
image: {
enabled: true,
models: [
{
type: "cli",
command: "gemini",
args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
},
],
},
},
},
};
const execModule = await import("../process/exec.js");
vi.mocked(execModule.runExec).mockResolvedValue({
stdout: "image description\n",
stderr: "",
});
const result = await applyMediaUnderstanding({
ctx,
cfg,
});
expect(result.appliedImage).toBe(true);
expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
expect(ctx.CommandBody).toBe("show Dom");
expect(ctx.RawBody).toBe("show Dom");
});
});