feat: add inbound media understanding

Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
2026-05-10 05:42:43 +00:00 · 2026-01-17 03:52:37 +00:00
parent 4b749f1b8f
commit 1b973f7506
42 changed files with 2547 additions and 101 deletions
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -0,0 +1,258 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+import type { ClawdbotConfig } from "../config/config.js";
+import type { MsgContext } from "../auto-reply/templating.js";
+import { resolveApiKeyForProvider } from "../agents/model-auth.js";
+import { fetchRemoteMedia } from "../media/fetch.js";
+
+vi.mock("../agents/model-auth.js", () => ({
+  resolveApiKeyForProvider: vi.fn(async () => ({
+    apiKey: "test-key",
+    source: "test",
+  })),
+}));
+
+vi.mock("../media/fetch.js", () => ({
+  fetchRemoteMedia: vi.fn(),
+}));
+
+vi.mock("../process/exec.js", () => ({
+  runExec: vi.fn(),
+}));
+
+async function loadApply() {
+  return await import("./apply.js");
+}
+
+describe("applyMediaUnderstanding", () => {
+  const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
+  const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);
+
+  beforeEach(() => {
+    mockedResolveApiKey.mockClear();
+    mockedFetchRemoteMedia.mockReset();
+    mockedFetchRemoteMedia.mockResolvedValue({
+      buffer: Buffer.from("audio-bytes"),
+      contentType: "audio/ogg",
+      fileName: "note.ogg",
+    });
+  });
+
+  it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "note.ogg");
+    await fs.writeFile(audioPath, "hello");
+
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPath: audioPath,
+      MediaType: "audio/ogg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "transcribed text" }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("transcribed text");
+    expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
+    expect(ctx.CommandBody).toBe("transcribed text");
+    expect(ctx.RawBody).toBe("transcribed text");
+  });
+
+  it("handles URL-only attachments for audio transcription", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaUrl: "https://example.com/note.ogg",
+      MediaType: "audio/ogg",
+      ChatType: "dm",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            scope: {
+              default: "deny",
+              rules: [{ action: "allow", match: { chatType: "direct" } }],
+            },
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "remote transcript" }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("remote transcript");
+    expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
+  });
+
+  it("skips audio transcription when attachment exceeds maxBytes", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "large.wav");
+    await fs.writeFile(audioPath, "0123456789");
+
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPath: audioPath,
+      MediaType: "audio/wav",
+    };
+    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 4,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: { groq: { id: "groq", transcribeAudio } },
+    });
+
+    expect(result.appliedAudio).toBe(false);
+    expect(transcribeAudio).not.toHaveBeenCalled();
+    expect(ctx.Body).toBe("<media:audio>");
+  });
+
+  it("falls back to CLI model when provider fails", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "note.ogg");
+    await fs.writeFile(audioPath, "hello");
+
+    const ctx: MsgContext = {
+      Body: "<media:audio>",
+      MediaPath: audioPath,
+      MediaType: "audio/ogg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            models: [
+              { provider: "groq" },
+              {
+                type: "cli",
+                command: "whisper",
+                args: ["{{MediaPath}}"],
+              },
+            ],
+          },
+        },
+      },
+    };
+
+    const execModule = await import("../process/exec.js");
+    vi.mocked(execModule.runExec).mockResolvedValue({
+      stdout: "cli transcript\n",
+      stderr: "",
+    });
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => {
+            throw new Error("boom");
+          },
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("cli transcript");
+    expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
+  });
+
+  it("uses CLI image understanding and preserves caption for commands", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const imagePath = path.join(dir, "photo.jpg");
+    await fs.writeFile(imagePath, "image-bytes");
+
+    const ctx: MsgContext = {
+      Body: "<media:image> show Dom",
+      MediaPath: imagePath,
+      MediaType: "image/jpeg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          image: {
+            enabled: true,
+            models: [
+              {
+                type: "cli",
+                command: "gemini",
+                args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
+              },
+            ],
+          },
+        },
+      },
+    };
+
+    const execModule = await import("../process/exec.js");
+    vi.mocked(execModule.runExec).mockResolvedValue({
+      stdout: "image description\n",
+      stderr: "",
+    });
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+    });
+
+    expect(result.appliedImage).toBe(true);
+    expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
+    expect(ctx.CommandBody).toBe("show Dom");
+    expect(ctx.RawBody).toBe("show Dom");
+  });
+});