feat: add inbound media understanding
Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
@@ -89,6 +89,7 @@
 - Docs: add Date & Time guide and update prompt/timezone configuration docs.
 - Messages: debounce rapid inbound messages across channels with per-connector overrides. (#971) — thanks @juanpablodlc.
 - Messages: allow media-only sends (CLI/tool) and show Telegram voice recording status for voice notes. (#957) — thanks @rdev.
+- Media: add optional inbound media understanding for image/audio/video with provider + CLI fallbacks. (#1005) — thanks @tristanmanchester.
 - Auth/Status: keep auth profiles sticky per session (rotate on compaction/new), surface provider usage headers in `/status` and `clawdbot models status`, and update docs.
 - CLI: add `--json` output for `clawdbot daemon` lifecycle/install commands.
 - Memory: make `node-llama-cpp` an optional dependency (avoid Node 25 install failures) and improve local-embeddings fallback/errors.
@@ -124,10 +124,21 @@ Save to `~/.clawdbot/clawdbot.json` and you can DM the bot from that number.

   // Tooling
   tools: {
-    audio: {
-      transcription: {
-        args: ["--model", "base", "{{MediaPath}}"],
+    media: {
+      audio: {
+        enabled: true,
+        maxBytes: 20971520,
+        models: [
+          { provider: "openai", model: "whisper-1" },
+          // Optional CLI fallback (Whisper binary):
+          // { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] }
+        ],
         timeoutSeconds: 120
+      },
+      video: {
+        enabled: true,
+        maxBytes: 52428800,
+        models: [{ provider: "google", model: "gemini-3-flash-preview" }]
       }
     }
   },
@@ -1769,6 +1769,58 @@ Legacy: `tools.bash` is still accepted as an alias.
 - `tools.web.fetch.firecrawl.maxAgeMs` (optional)
 - `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
+
+`tools.media` configures inbound media understanding (image/audio/video):
+
+- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
+  - `enabled`: opt-out switch (default true).
+  - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically).
+  - `maxChars`: max output characters (default 500 for image/video; unset for audio).
+  - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB).
+  - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s).
+  - `language`: optional audio hint.
+  - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`.
+  - `models`: ordered list of model entries; failures or oversize media fall back to the next entry.
+- Each `models[]` entry:
+  - Provider entry (`type: "provider"` or omitted):
+    - `provider`: API provider id (`openai`, `anthropic`, `google`/`gemini`, `groq`, etc).
+    - `model`: model id override (required for image; defaults to `whisper-1`/`whisper-large-v3-turbo` for audio providers, and `gemini-3-flash-preview` for video).
+    - `profile` / `preferredProfile`: auth profile selection.
+  - CLI entry (`type: "cli"`):
+    - `command`: executable to run.
+    - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc).
+  - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry.
+  - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry.
+
+If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments.
+
+Provider auth follows the standard model auth order (auth profiles, env vars like `OPENAI_API_KEY`/`GROQ_API_KEY`/`GEMINI_API_KEY`, or `models.providers.*.apiKey`).
+
+Example:
+
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        maxBytes: 20971520,
+        scope: {
+          default: "deny",
+          rules: [{ action: "allow", match: { chatType: "direct" } }]
+        },
+        models: [
+          { provider: "openai", model: "whisper-1" },
+          { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] }
+        ]
+      },
+      video: {
+        enabled: true,
+        maxBytes: 52428800,
+        models: [{ provider: "google", model: "gemini-3-flash-preview" }]
+      }
+    }
+  }
+}
+```
+
 `agents.defaults.subagents` configures sub-agent defaults:
 - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call.
 - `maxConcurrent`: max concurrent sub-agent runs (default 1)
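
A hedged sketch of the `scope` gating described above, limiting image understanding to one connector namespace via `match.keyPrefix` (the `whatsapp:` prefix is illustrative; actual session-key prefixes depend on your channel setup):

```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        // First match wins; anything unmatched falls through to `default`.
        scope: {
          default: "deny",
          rules: [{ action: "allow", match: { keyPrefix: "whatsapp:" } }]
        },
        models: [{ provider: "openai", model: "gpt-5.2" }]
      }
    }
  }
}
```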
@@ -2848,7 +2900,7 @@ clawdbot dns setup --apply

 ## Template variables

-Template placeholders are expanded in `tools.audio.transcription.args` (and any future templated argument fields).
+Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields).

 | Variable | Description |
 |----------|-------------|
@@ -2864,6 +2916,8 @@ Template placeholders are expanded in `tools.audio.transcription.args` (and any
 | `{{MediaPath}}` | Local media path (if downloaded) |
 | `{{MediaType}}` | Media type (image/audio/document/…) |
 | `{{Transcript}}` | Audio transcript (when enabled) |
+| `{{Prompt}}` | Resolved media prompt for CLI entries |
+| `{{MaxChars}}` | Resolved max output chars for CLI entries |
 | `{{ChatType}}` | `"direct"` or `"group"` |
 | `{{GroupSubject}}` | Group subject (best effort) |
 | `{{GroupMembers}}` | Group members preview (best effort) |
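
For CLI entries, `{{Prompt}}` and `{{MaxChars}}` expand alongside `{{MediaPath}}`. A minimal sketch (the `describe-image` binary here is hypothetical, standing in for any describer on your PATH):

```json5
{
  type: "cli",
  command: "describe-image", // hypothetical binary, not shipped with Clawdbot
  args: ["--max-chars", "{{MaxChars}}", "--prompt", "{{Prompt}}", "{{MediaPath}}"]
}
```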
@@ -111,7 +111,7 @@ Current migrations:
 - `routing.bindings` → top-level `bindings`
 - `routing.agents`/`routing.defaultAgentId` → `agents.list` + `agents.list[].default`
 - `routing.agentToAgent` → `tools.agentToAgent`
-- `routing.transcribeAudio` → `tools.audio.transcription`
+- `routing.transcribeAudio` → `tools.media.audio.models`
 - `bindings[].match.accountID` → `bindings[].match.accountId`
 - `identity` → `agents.list[].identity`
 - `agent.*` → `agents.defaults` + `tools.*` (tools/elevated/exec/sandbox/subagents)
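
A sketch of what the auto-migration does on load, assuming the legacy config used the old single-command Whisper shape:

```json5
// Before (legacy, assumed shape):
{ routing: { transcribeAudio: { command: ["whisper", "--model", "base"], timeoutSeconds: 2 } } }

// After (auto-migrated):
{
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [{ type: "cli", command: "whisper", args: ["--model", "base"], timeoutSeconds: 2 }]
      }
    }
  }
}
```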
@@ -3,25 +3,59 @@ summary: "How inbound audio/voice notes are downloaded, transcribed, and injecte
 read_when:
 - Changing audio transcription or media handling
 ---
-# Audio / Voice Notes — 2025-12-05
+# Audio / Voice Notes — 2026-01-17

 ## What works
-- **Optional transcription**: If `tools.audio.transcription` is set in `~/.clawdbot/clawdbot.json`, Clawdbot will:
-  1) Download inbound audio to a temp path when WhatsApp only provides a URL.
-  2) Run the configured CLI args (templated with `{{MediaPath}}`), expecting transcript on stdout.
-  3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both.
-  4) Continue through the normal auto-reply pipeline (templating, sessions, Pi command).
-- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body.
+- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot:
+  1) Locates the first audio attachment (local path or URL) and downloads it if needed.
+  2) Enforces `maxBytes` before sending to each model entry.
+  3) Runs the first eligible model entry in order (provider or CLI).
+  4) If it fails or skips (size/timeout), it tries the next entry.
+  5) On success, it replaces `Body` with an `[Audio]` block and sets `{{Transcript}}`.
+- **Command parsing**: When transcription succeeds, `CommandBody`/`RawBody` are set to the transcript so slash commands still work.
+- **Verbose logging**: In `--verbose`, we log when transcription runs and when it replaces the body.

-## Config example (Whisper CLI)
-Requires `whisper` CLI installed:
+## Config examples
+
+### Provider + CLI fallback (OpenAI + Whisper CLI)
 ```json5
 {
   tools: {
-    audio: {
-      transcription: {
-        args: ["--model", "base", "{{MediaPath}}"],
-        timeoutSeconds: 45
+    media: {
+      audio: {
+        enabled: true,
+        maxBytes: 20971520,
+        models: [
+          { provider: "openai", model: "whisper-1" },
+          {
+            type: "cli",
+            command: "whisper",
+            args: ["--model", "base", "{{MediaPath}}"],
+            timeoutSeconds: 45
+          }
+        ]
+      }
+    }
+  }
+}
+```
+
+### Provider-only with scope gating
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        scope: {
+          default: "allow",
+          rules: [
+            { action: "deny", match: { chatType: "group" } }
+          ]
+        },
+        models: [
+          { provider: "openai", model: "whisper-1" }
+        ]
       }
     }
   }
 }
@@ -29,12 +63,13 @@ Requires `whisper` CLI installed:
 ```

 ## Notes & limits
-- We don’t ship a transcriber; you opt in with the Whisper CLI on your PATH.
+- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
-- Size guard: inbound audio must be ≤5 MB (matches the temp media store and transcript pipeline).
+- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
-- Outbound caps: web send supports audio/voice up to 16 MB (sent as a voice note with `ptt: true`).
+- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
-- If transcription fails, we fall back to the original body/media note; replies still go through.
+- Transcript is available to templates as `{{Transcript}}`.
-- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode.
+- CLI stdout is capped (5MB); keep CLI output concise.

 ## Gotchas
+- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`.
 - Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`.
-- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue.
+- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue.
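
One way to satisfy the plain-text gotcha above is to wrap a JSON-emitting transcriber in a shell pipeline. A sketch, assuming `jq` is on your PATH and `my-transcriber` is a placeholder for any CLI that prints `{"text": "..."}`:

```json5
{
  type: "cli",
  command: "sh",
  // "my-transcriber" is hypothetical; substitute your actual JSON-emitting CLI.
  args: ["-c", "my-transcriber {{MediaPath}} | jq -r .text"]
}
```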
@@ -38,13 +38,23 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren
 - `{{MediaUrl}}` pseudo-URL for the inbound media.
 - `{{MediaPath}}` local temp path written before running the command.
 - When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/<filename>`.
-- Audio transcription (if configured via `tools.audio.transcription`) runs before templating and can replace `Body` with the transcript.
+- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
+  - Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
+  - Video and image descriptions preserve any caption text for command parsing.
+  - Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched.

 ## Limits & Errors
+**Outbound send caps (WhatsApp web send)**
 - Images: ~6 MB cap after recompression.
 - Audio/voice/video: 16 MB cap; documents: 100 MB cap.
 - Oversize or unreadable media → clear error in logs and the reply is skipped.
+
+**Media understanding caps (transcription/description)**
+- Image default: 10 MB (`tools.media.image.maxBytes`).
+- Audio default: 20 MB (`tools.media.audio.maxBytes`).
+- Video default: 50 MB (`tools.media.video.maxBytes`).
+- Oversize media skips understanding, but replies still go through with the original body.

 ## Notes for Tests
 - Cover send + reply flows for image/audio/document cases.
 - Validate recompression for images (size bound) and voice-note flag for audio.
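
A hedged sketch of limiting audio understanding to WhatsApp DMs only, assuming your channel id is `whatsapp` (channel ids follow your connector configuration):

```json5
{
  tools: {
    media: {
      audio: {
        scope: {
          default: "deny",
          rules: [{ action: "allow", match: { channel: "whatsapp", chatType: "direct" } }]
        },
        models: [{ provider: "openai", model: "whisper-1" }]
      }
    }
  }
}
```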
docs/nodes/media-understanding.md (new file, 217 lines)
@@ -0,0 +1,217 @@
---
summary: "Inbound image/audio/video understanding (optional) with provider + CLI fallbacks"
read_when:
- Designing or refactoring media understanding
- Tuning inbound audio/video/image preprocessing
---
# Media Understanding (Inbound) — 2026-01-17

Clawdbot can optionally **summarize inbound media** (image/audio/video) before the reply pipeline runs. This is **opt-in** and separate from the base attachment flow—if understanding is off, models still receive the original files/URLs as usual.

## Goals
- Optional: pre‑digest inbound media into short text for faster routing + better command parsing.
- Preserve original media delivery to the model (always).
- Support **provider APIs** and **CLI fallbacks**.
- Allow multiple models with ordered fallback (error/size/timeout).

## High‑level behavior
1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
2) For each enabled capability (image/audio/video), pick the **first matching attachment**.
3) Choose the first eligible model entry (size + capability + auth).
4) If a model fails or the media is too large, **fall back to the next entry**.
5) On success:
   - `Body` becomes an `[Image]`, `[Audio]`, or `[Video]` block.
   - Audio sets `{{Transcript}}` and `CommandBody`/`RawBody` for command parsing.
   - Captions are preserved as `User text:` inside the block.

If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.

## Config overview
Use **per‑capability configs** under `tools.media`. Each capability can define:
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
- an **ordered `models` list** (fallback order)
- `scope` (optional gating by channel/chatType/session key)

```json5
{
  tools: {
    media: {
      image: { /* config */ },
      audio: { /* config */ },
      video: { /* config */ }
    }
  }
}
```

### Model entries
Each `models[]` entry can be **provider** or **CLI**:

```json5
{
  type: "provider", // default if omitted
  provider: "openai",
  model: "gpt-5.2",
  prompt: "Describe the image in <= 500 chars.",
  maxChars: 500,
  maxBytes: 10485760,
  timeoutSeconds: 60,
  capabilities: ["image"], // optional, used for multi‑modal entries
  profile: "vision-profile",
  preferredProfile: "vision-fallback"
}
```

```json5
{
  type: "cli",
  command: "gemini",
  args: [
    "-m",
    "gemini-3-flash",
    "--allowed-tools",
    "read_file",
    "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
  ],
  maxChars: 500,
  maxBytes: 52428800,
  timeoutSeconds: 120,
  capabilities: ["video", "image"]
}
```

## Defaults and limits
Recommended defaults:
- `maxChars`: **500** for image/video (short, command‑friendly)
- `maxChars`: **unset** for audio (full transcript unless you set a limit)
- `maxBytes`:
  - image: **10MB**
  - audio: **20MB**
  - video: **50MB**

Rules:
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
- If the model returns more than `maxChars`, output is trimmed.
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
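A sketch of how the size rule drives fallback: a small-cap provider entry first, then a larger-cap CLI entry (the gemini CLI entry from above) that catches oversize media:

```json5
{
  models: [
    // Tried first; skipped for anything over 10MB, which triggers the next entry.
    { provider: "openai", model: "gpt-5.2", maxBytes: 10485760 },
    // Fallback for larger files (50MB cap).
    {
      type: "cli",
      command: "gemini",
      args: [
        "-m", "gemini-3-flash", "--allowed-tools", "read_file",
        "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
      ],
      maxBytes: 52428800
    }
  ]
}
```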
## Capabilities (optional)
If you set `capabilities`, the entry only runs for those media types. Suggested defaults when you opt in:
- `openai`, `anthropic`: **image**
- `google` (Gemini API): **image + audio + video**
- CLI entries: declare the exact capabilities you support.

If you omit `capabilities`, the entry is eligible for the list it appears in.

## Provider support matrix (Clawdbot integrations)
| Capability | Provider integration | Notes |
|------------|----------------------|-------|
| Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. |
| Audio | OpenAI, Groq | Provider transcription (Whisper). |
| Video | Google (Gemini API) | Provider video understanding. |

## Recommended providers
**Image**
- Prefer your active model if it supports images.
- Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`.

**Audio**
- `openai/whisper-1` or `groq/whisper-large-v3-turbo`.
- CLI fallback: `whisper` binary.

**Video**
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
- CLI fallback: `gemini` CLI (supports `read_file` on video/audio).

## Config examples

### 1) Audio + Video only (image off)
```json5
{
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [
          { provider: "openai", model: "whisper-1" },
          {
            type: "cli",
            command: "whisper",
            args: ["--model", "base", "{{MediaPath}}"]
          }
        ]
      },
      video: {
        enabled: true,
        maxChars: 500,
        models: [
          { provider: "google", model: "gemini-3-flash-preview" },
          {
            type: "cli",
            command: "gemini",
            args: [
              "-m",
              "gemini-3-flash",
              "--allowed-tools",
              "read_file",
              "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
            ]
          }
        ]
      }
    }
  }
}
```

### 2) Optional image understanding
```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        maxBytes: 10485760,
        maxChars: 500,
        models: [
          { provider: "openai", model: "gpt-5.2" },
          { provider: "anthropic", model: "claude-opus-4-5" },
          {
            type: "cli",
            command: "gemini",
            args: [
              "-m",
              "gemini-3-flash",
              "--allowed-tools",
              "read_file",
              "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
            ]
          }
        ]
      }
    }
  }
}
```

### 3) Multi‑modal single entry (explicit capabilities)
```json5
{
  tools: {
    media: {
      image: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] },
      audio: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] },
      video: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] }
    }
  }
}
```

## Notes
- Understanding is **best‑effort**. Errors do not block replies.
- Attachments are still passed to models even when understanding is disabled.
- Use `scope` to limit where understanding runs (e.g. only DMs).

## Related docs
- [Configuration](/gateway/configuration)
- [Image & Media Support](/nodes/images)
@@ -254,7 +254,10 @@ export function createExecTool(
   let yielded = false;
   let yieldTimer: NodeJS.Timeout | null = null;
   let timeoutTimer: NodeJS.Timeout | null = null;
+  let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
   let timedOut = false;
+  const timeoutFinalizeMs = 1000;
+  let rejectFn: ((err: Error) => void) | null = null;

   const settle = (fn: () => void) => {
     if (settled) return;
@@ -262,6 +265,18 @@ export function createExecTool(
     fn();
   };
+
+  const effectiveTimeout =
+    typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
+  const finalizeTimeout = () => {
+    if (session.exited) return;
+    markExited(session, null, "SIGKILL", "failed");
+    if (settled || !rejectFn) return;
+    const aggregated = session.aggregated.trim();
+    const reason = `Command timed out after ${effectiveTimeout} seconds`;
+    const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
+    settle(() => rejectFn?.(new Error(message)));
+  };
+
   // Tool-call abort should not kill backgrounded sessions; timeouts still must.
   const onAbortSignal = () => {
     if (yielded || session.backgrounded) return;
@@ -272,15 +287,17 @@ export function createExecTool(
   const onTimeout = () => {
     timedOut = true;
     killSession(session);
+    if (!timeoutFinalizeTimer) {
+      timeoutFinalizeTimer = setTimeout(() => {
+        finalizeTimeout();
+      }, timeoutFinalizeMs);
+    }
   };

   if (signal?.aborted) onAbortSignal();
   else if (signal) {
     signal.addEventListener("abort", onAbortSignal, { once: true });
   }
-
-  const effectiveTimeout =
-    typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
   if (effectiveTimeout > 0) {
     timeoutTimer = setTimeout(() => {
       onTimeout();
@@ -321,6 +338,7 @@ export function createExecTool(
   });

   return new Promise<AgentToolResult<ExecToolDetails>>((resolve, reject) => {
+    rejectFn = reject;
     const resolveRunning = () => {
       settle(() =>
         resolve({
@@ -369,6 +387,7 @@ export function createExecTool(
   const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
     if (yieldTimer) clearTimeout(yieldTimer);
     if (timeoutTimer) clearTimeout(timeoutTimer);
+    if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
     const durationMs = Date.now() - startedAt;
     const wasSignal = exitSignal != null;
     const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut;
@@ -421,6 +440,7 @@ export function createExecTool(
   child.once("error", (err) => {
     if (yieldTimer) clearTimeout(yieldTimer);
     if (timeoutTimer) clearTimeout(timeoutTimer);
+    if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
     markExited(session, null, null, "failed");
     settle(() => reject(err));
   });
@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
     ].join("\n"),
   );
 });
+
+it("skips media notes for attachments with understanding output", () => {
+  const note = buildInboundMediaNote({
+    MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
+    MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
+    MediaUnderstanding: [
+      {
+        kind: "audio.transcription",
+        attachmentIndex: 0,
+        text: "hello",
+        provider: "groq",
+      },
+    ],
+  });
+  expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
+});
 });
@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
 }

 export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
+  // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
+  const suppressed = new Set(
+    Array.isArray(ctx.MediaUnderstanding)
+      ? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
+      : [],
+  );
   const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
   const paths =
     pathsFromArray && pathsFromArray.length > 0
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
       ? ctx.MediaTypes
       : undefined;

-  if (paths.length === 1) {
+  const entries = paths
+    .map((entry, index) => ({
+      path: entry ?? "",
+      type: types?.[index] ?? ctx.MediaType,
+      url: urls?.[index] ?? ctx.MediaUrl,
+      index,
+    }))
+    .filter((entry) => !suppressed.has(entry.index));
+  if (entries.length === 0) return undefined;
+  if (entries.length === 1) {
     return formatMediaAttachedLine({
-      path: paths[0] ?? "",
-      type: types?.[0] ?? ctx.MediaType,
-      url: urls?.[0] ?? ctx.MediaUrl,
+      path: entries[0]?.path ?? "",
+      type: entries[0]?.type,
+      url: entries[0]?.url,
     });
   }

-  const count = paths.length;
+  const count = entries.length;
   const lines: string[] = [`[media attached: ${count} files]`];
-  for (const [idx, mediaPath] of paths.entries()) {
+  for (const [idx, entry] of entries.entries()) {
     lines.push(
       formatMediaAttachedLine({
-        path: mediaPath,
+        path: entry.path,
         index: idx + 1,
         total: count,
-        type: types?.[idx],
-        url: urls?.[idx],
+        type: entry.type,
+        url: entry.url,
       }),
     );
   }
@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
   const commandSource =
     sessionCtx.CommandBody ??
     sessionCtx.RawBody ??
+    sessionCtx.Transcript ??
     sessionCtx.BodyStripped ??
     sessionCtx.Body ??
     "";
@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
     cap?: number;
     dropPolicy?: InlineDirectives["dropPolicy"];
   };
-  transcribedText?: string;
   typing: TypingController;
   opts?: GetReplyOptions;
   defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
   model,
   perMessageQueueMode,
   perMessageQueueOptions,
-  transcribedText,
   typing,
   opts,
   defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
   sessionEntry = skillResult.sessionEntry ?? sessionEntry;
   currentSystemSent = skillResult.systemSent;
   const skillsSnapshot = skillResult.skillsSnapshot;
-  const prefixedBody = transcribedText
-    ? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
-        .filter(Boolean)
-        .join("\n\n")
-    : [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
+  const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
   const mediaNote = buildInboundMediaNote(ctx);
   const mediaReplyHint = mediaNote
     ? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
   }
   const sessionIdFinal = sessionId ?? crypto.randomUUID();
   const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
-  const queueBodyBase = transcribedText
-    ? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
-        .filter(Boolean)
-        .join("\n\n")
-    : [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
+  const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
   const queuedBody = mediaNote
     ? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
     : queueBodyBase;
@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
 import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
 import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
 import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
-import { logVerbose } from "../../globals.js";
 import { defaultRuntime } from "../../runtime.js";
 import { resolveCommandAuthorization } from "../command-auth.js";
 import type { MsgContext } from "../templating.js";
 import { SILENT_REPLY_TOKEN } from "../tokens.js";
-import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveDefaultModel } from "./directive-handling.js";
 import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
   });
   opts?.onTypingController?.(typing);

-  let transcribedText: string | undefined;
-  if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
-    const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
-    if (transcribed?.text) {
-      transcribedText = transcribed.text;
-      ctx.Body = transcribed.text;
-      ctx.Transcript = transcribed.text;
-      logVerbose("Replaced Body with audio transcript for reply flow");
-    }
-  }
+  await applyMediaUnderstanding({
+    ctx,
+    cfg,
+    agentDir,
+  });

   const commandAuthorized = ctx.CommandAuthorized ?? true;
   resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
   model,
   perMessageQueueMode,
   perMessageQueueOptions,
-  transcribedText,
   typing,
   opts,
   defaultModel,
@@ -1,6 +1,7 @@
 import type { ChannelId } from "../channels/plugins/types.js";
 import type { InternalMessageChannel } from "../utils/message-channel.js";
 import type { CommandArgs } from "./commands-registry.types.js";
+import type { MediaUnderstandingOutput } from "../media-understanding/types.js";

 /** Valid message channels for routing. */
 export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -41,6 +42,9 @@ export type MsgContext = {
   /** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
   MediaRemoteHost?: string;
   Transcript?: string;
+  MediaUnderstanding?: MediaUnderstandingOutput[];
+  Prompt?: string;
+  MaxChars?: number;
   ChatType?: string;
   GroupSubject?: string;
   GroupRoom?: string;
@@ -65,7 +65,7 @@ describe("legacy config detection", () => {
     expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]);
     expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined();
   });
-  it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => {
+  it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => {
     vi.resetModules();
     const { migrateLegacyConfig } = await import("./config.js");
     const res = migrateLegacyConfig({
@@ -80,7 +80,7 @@ describe("legacy config detection", () => {
     });
     expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent.");
     expect(res.changes).toContain("Moved routing.queue → messages.queue.");
-    expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription.");
+    expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models.");
     expect(res.config?.tools?.agentToAgent).toEqual({
       enabled: true,
       allow: ["main"],
@@ -89,9 +89,16 @@ describe("legacy config detection", () => {
       mode: "queue",
       cap: 3,
     });
-    expect(res.config?.tools?.audio?.transcription).toEqual({
-      args: ["--model", "base"],
-      timeoutSeconds: 2,
+    expect(res.config?.tools?.media?.audio).toEqual({
+      enabled: true,
+      models: [
+        {
+          command: "whisper",
+          type: "cli",
+          args: ["--model", "base"],
+          timeoutSeconds: 2,
+        },
+      ],
     });
     expect(res.config?.routing).toBeUndefined();
   });
@@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
     const mapped = mapLegacyAudioTranscription(routing.transcribeAudio);
     if (mapped) {
       const tools = ensureRecord(raw, "tools");
-      const toolsAudio = ensureRecord(tools, "audio");
-      if (toolsAudio.transcription === undefined) {
-        toolsAudio.transcription = mapped;
-        changes.push("Moved routing.transcribeAudio → tools.audio.transcription.");
+      const media = ensureRecord(tools, "media");
+      const mediaAudio = ensureRecord(media, "audio");
+      const models = Array.isArray(mediaAudio.models)
+        ? (mediaAudio.models as unknown[])
+        : [];
+      if (models.length === 0) {
+        mediaAudio.enabled = true;
+        mediaAudio.models = [mapped];
+        changes.push("Moved routing.transcribeAudio → tools.media.audio.models.");
       } else {
-        changes.push(
-          "Removed routing.transcribeAudio (tools.audio.transcription already set).",
-        );
+        changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set).");
       }
     } else {
       changes.push("Removed routing.transcribeAudio (unsupported transcription CLI).");
@@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
     const mapped = mapLegacyAudioTranscription(audio.transcription);
     if (mapped) {
       const tools = ensureRecord(raw, "tools");
-      const toolsAudio = ensureRecord(tools, "audio");
-      if (toolsAudio.transcription === undefined) {
-        toolsAudio.transcription = mapped;
-        changes.push("Moved audio.transcription → tools.audio.transcription.");
+      const media = ensureRecord(tools, "media");
+      const mediaAudio = ensureRecord(media, "audio");
+      const models = Array.isArray(mediaAudio.models)
+        ? (mediaAudio.models as unknown[])
+        : [];
+      if (models.length === 0) {
+        mediaAudio.enabled = true;
+        mediaAudio.models = [mapped];
+        changes.push("Moved audio.transcription → tools.media.audio.models.");
       } else {
-        changes.push("Removed audio.transcription (tools.audio.transcription already set).");
+        changes.push("Removed audio.transcription (tools.media.audio.models already set).");
       }
       delete audio.transcription;
       if (Object.keys(audio).length === 0) delete raw.audio;
@@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [
   {
     path: ["routing", "transcribeAudio"],
     message:
-      "routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).",
+      "routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).",
   },
   {
     path: ["telegram", "requireMention"],
@@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record<string, unkn
   const timeoutSeconds =
     typeof transcriber?.timeoutSeconds === "number" ? transcriber?.timeoutSeconds : undefined;

-  const result: Record<string, unknown> = {};
+  const result: Record<string, unknown> = { command: rawExecutable, type: "cli" };
   if (args.length > 0) result.args = args;
   if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds;
   return result;
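
The mapped entry now carries the executable and `type: "cli"`, so it can slot directly into `tools.media.audio.models`. Given a legacy `whisper` executable with `["--model", "base"]` args and a 2s timeout, the mapper's output would be the sketch below:

```json5
{ type: "cli", command: "whisper", args: ["--model", "base"], timeoutSeconds: 2 }
```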
@@ -102,8 +102,28 @@ const FIELD_LABELS: Record<string, string> = {
   "gateway.remote.password": "Remote Gateway Password",
   "gateway.auth.token": "Gateway Token",
   "gateway.auth.password": "Gateway Password",
-  "tools.audio.transcription.args": "Audio Transcription Args",
-  "tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)",
+  "tools.media.image.enabled": "Enable Image Understanding",
+  "tools.media.image.maxBytes": "Image Understanding Max Bytes",
+  "tools.media.image.maxChars": "Image Understanding Max Chars",
+  "tools.media.image.prompt": "Image Understanding Prompt",
+  "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
+  "tools.media.image.models": "Image Understanding Models",
+  "tools.media.image.scope": "Image Understanding Scope",
+  "tools.media.audio.enabled": "Enable Audio Understanding",
+  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
+  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
+  "tools.media.audio.prompt": "Audio Understanding Prompt",
+  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
+  "tools.media.audio.language": "Audio Understanding Language",
+  "tools.media.audio.models": "Audio Understanding Models",
+  "tools.media.audio.scope": "Audio Understanding Scope",
+  "tools.media.video.enabled": "Enable Video Understanding",
+  "tools.media.video.maxBytes": "Video Understanding Max Bytes",
+  "tools.media.video.maxChars": "Video Understanding Max Chars",
+  "tools.media.video.prompt": "Video Understanding Prompt",
+  "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
+  "tools.media.video.models": "Video Understanding Models",
+  "tools.media.video.scope": "Video Understanding Scope",
   "tools.profile": "Tool Profile",
   "agents.list[].tools.profile": "Agent Tool Profile",
   "tools.byProvider": "Tool Policy by Provider",
@@ -47,7 +47,7 @@ export type BroadcastConfig = {
 };

 export type AudioConfig = {
-  /** @deprecated Use tools.audio.transcription instead. */
+  /** @deprecated Use tools.media.audio.models instead. */
   transcription?: {
     // Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
     command: string[];
@@ -1,4 +1,76 @@
-import type { AgentElevatedAllowFromConfig } from "./types.base.js";
+import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js";
+
+export type MediaUnderstandingScopeMatch = {
+  channel?: string;
+  chatType?: "direct" | "group" | "room";
+  keyPrefix?: string;
+};
+
+export type MediaUnderstandingScopeRule = {
+  action: SessionSendPolicyAction;
+  match?: MediaUnderstandingScopeMatch;
+};
+
+export type MediaUnderstandingScopeConfig = {
+  default?: SessionSendPolicyAction;
+  rules?: MediaUnderstandingScopeRule[];
+};
+
+export type MediaUnderstandingCapability = "image" | "audio" | "video";
+
+export type MediaUnderstandingModelConfig = {
+  /** Provider API id (e.g. openai, google). */
+  provider?: string;
+  /** Model id for provider-based understanding. */
+  model?: string;
+  /** Optional capability tags for shared model lists. */
+  capabilities?: MediaUnderstandingCapability[];
+  /** Use a CLI command instead of provider API. */
+  type?: "provider" | "cli";
+  /** CLI binary (required when type=cli). */
+  command?: string;
+  /** CLI args (template-enabled). */
+  args?: string[];
+  /** Optional prompt override for this model entry. */
+  prompt?: string;
+  /** Optional max output characters for this model entry. */
+  maxChars?: number;
+  /** Optional max bytes for this model entry. */
+  maxBytes?: number;
+  /** Optional timeout override (seconds) for this model entry. */
+  timeoutSeconds?: number;
+  /** Optional language hint for audio transcription. */
+  language?: string;
+  /** Auth profile id to use for this provider. */
+  profile?: string;
+  /** Preferred profile id if multiple are available. */
+  preferredProfile?: string;
+};
+
+export type MediaUnderstandingConfig = {
+  /** Enable media understanding when models are configured. */
+  enabled?: boolean;
+  /** Optional scope gating for understanding. */
+  scope?: MediaUnderstandingScopeConfig;
+  /** Default max bytes to send. */
+  maxBytes?: number;
+  /** Default max output characters. */
+  maxChars?: number;
+  /** Default prompt. */
+  prompt?: string;
+  /** Default timeout (seconds). */
+  timeoutSeconds?: number;
+  /** Default language hint (audio). */
+  language?: string;
+  /** Ordered model list (fallbacks in order). */
+  models?: MediaUnderstandingModelConfig[];
+};
+
+export type MediaToolsConfig = {
+  image?: MediaUnderstandingConfig;
+  audio?: MediaUnderstandingConfig;
+  video?: MediaUnderstandingConfig;
+};

 export type ToolProfileId = "minimal" | "coding" | "messaging" | "full";
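
A config value conforming to `MediaToolsConfig` might look like the following sketch (model ids taken from the docs above):

```json5
{
  image: { enabled: true, maxChars: 500, models: [{ provider: "openai", model: "gpt-5.2" }] },
  audio: { enabled: true, language: "en", models: [{ provider: "groq", model: "whisper-large-v3-turbo" }] },
  video: { enabled: false }
}
```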
@@ -127,13 +199,7 @@ export type ToolsConfig = {
       };
     };
   };
-  audio?: {
-    transcription?: {
-      /** CLI args (template-enabled). */
-      args?: string[];
-      timeoutSeconds?: number;
-    };
-  };
+  media?: MediaToolsConfig;
   /** Message tool configuration. */
   message?: {
     /**
@@ -5,7 +5,7 @@ import {
   GroupChatSchema,
   HumanDelaySchema,
   IdentitySchema,
-  ToolsAudioTranscriptionSchema,
+  ToolsMediaSchema,
 } from "./zod-schema.core.js";

 export const HeartbeatSchema = z
@@ -283,11 +283,7 @@ export const ToolsSchema = z
   deny: z.array(z.string()).optional(),
   byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(),
   web: ToolsWebSchema,
-  audio: z
-    .object({
-      transcription: ToolsAudioTranscriptionSchema,
-    })
-    .optional(),
+  media: ToolsMediaSchema,
   message: z
     .object({
       allowCrossContextSend: z.boolean().optional(),
@@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z
   .string()
   .refine(isSafeExecutableValue, "expected safe executable name or path");

-export const ToolsAudioTranscriptionSchema = z
-  .object({
-    args: z.array(z.string()).optional(),
-    timeoutSeconds: z.number().int().positive().optional(),
-  })
-  .optional();
+export const MediaUnderstandingScopeSchema = z
+  .object({
+    default: z.union([z.literal("allow"), z.literal("deny")]).optional(),
+    rules: z
+      .array(
+        z.object({
+          action: z.union([z.literal("allow"), z.literal("deny")]),
+          match: z
+            .object({
+              channel: z.string().optional(),
+              chatType: z
+                .union([z.literal("direct"), z.literal("group"), z.literal("room")])
+                .optional(),
+              keyPrefix: z.string().optional(),
+            })
+            .optional(),
+        }),
+      )
+      .optional(),
+  })
+  .optional();
+
+export const MediaUnderstandingCapabilitiesSchema = z
+  .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
+  .optional();
+
+export const MediaUnderstandingModelSchema = z
+  .object({
+    provider: z.string().optional(),
+    model: z.string().optional(),
+    capabilities: MediaUnderstandingCapabilitiesSchema,
+    type: z.union([z.literal("provider"), z.literal("cli")]).optional(),
+    command: z.string().optional(),
+    args: z.array(z.string()).optional(),
+    prompt: z.string().optional(),
+    maxChars: z.number().int().positive().optional(),
+    maxBytes: z.number().int().positive().optional(),
+    timeoutSeconds: z.number().int().positive().optional(),
+    language: z.string().optional(),
+    profile: z.string().optional(),
+    preferredProfile: z.string().optional(),
+  })
+  .optional();
+
+export const ToolsMediaUnderstandingSchema = z
+  .object({
+    enabled: z.boolean().optional(),
+    scope: MediaUnderstandingScopeSchema,
+    maxBytes: z.number().int().positive().optional(),
+    maxChars: z.number().int().positive().optional(),
+    prompt: z.string().optional(),
+    timeoutSeconds: z.number().int().positive().optional(),
+    language: z.string().optional(),
+    models: z.array(MediaUnderstandingModelSchema).optional(),
+  })
+  .optional();
+
+export const ToolsMediaSchema = z
+  .object({
+    image: ToolsMediaUnderstandingSchema.optional(),
+    audio: ToolsMediaUnderstandingSchema.optional(),
+    video: ToolsMediaUnderstandingSchema.optional(),
+  })
+  .optional();
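Note: a quick sketch of how these schemas validate a tools.media fragment, assuming ToolsMediaSchema is imported from zod-schema.core.js as above. Unknown chatType values or non-positive byte limits are rejected at parse time.

    import { ToolsMediaSchema } from "./zod-schema.core.js";

    const parsed = ToolsMediaSchema.parse({
      image: { enabled: true, maxChars: 300 },
      audio: {
        scope: {
          default: "deny",
          rules: [{ action: "allow", match: { chatType: "direct" } }],
        },
      },
    });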
src/media-understanding/apply.test.ts (new file, 258 lines)
@@ -0,0 +1,258 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";

import { beforeEach, describe, expect, it, vi } from "vitest";

import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import { fetchRemoteMedia } from "../media/fetch.js";

vi.mock("../agents/model-auth.js", () => ({
  resolveApiKeyForProvider: vi.fn(async () => ({
    apiKey: "test-key",
    source: "test",
  })),
}));

vi.mock("../media/fetch.js", () => ({
  fetchRemoteMedia: vi.fn(),
}));

vi.mock("../process/exec.js", () => ({
  runExec: vi.fn(),
}));

async function loadApply() {
  return await import("./apply.js");
}

describe("applyMediaUnderstanding", () => {
  const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
  const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);

  beforeEach(() => {
    mockedResolveApiKey.mockClear();
    mockedFetchRemoteMedia.mockReset();
    mockedFetchRemoteMedia.mockResolvedValue({
      buffer: Buffer.from("audio-bytes"),
      contentType: "audio/ogg",
      fileName: "note.ogg",
    });
  });

  it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
    await fs.writeFile(audioPath, "hello");

    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: audioPath,
      MediaType: "audio/ogg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            models: [{ provider: "groq" }],
          },
        },
      },
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async () => ({ text: "transcribed text" }),
        },
      },
    });

    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("transcribed text");
    expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
    expect(ctx.CommandBody).toBe("transcribed text");
    expect(ctx.RawBody).toBe("transcribed text");
  });

  it("handles URL-only attachments for audio transcription", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaUrl: "https://example.com/note.ogg",
      MediaType: "audio/ogg",
      ChatType: "dm",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            scope: {
              default: "deny",
              rules: [{ action: "allow", match: { chatType: "direct" } }],
            },
            models: [{ provider: "groq" }],
          },
        },
      },
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async () => ({ text: "remote transcript" }),
        },
      },
    });

    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("remote transcript");
    expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
  });

  it("skips audio transcription when attachment exceeds maxBytes", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPath = path.join(dir, "large.wav");
    await fs.writeFile(audioPath, "0123456789");

    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: audioPath,
      MediaType: "audio/wav",
    };
    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 4,
            models: [{ provider: "groq" }],
          },
        },
      },
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: { groq: { id: "groq", transcribeAudio } },
    });

    expect(result.appliedAudio).toBe(false);
    expect(transcribeAudio).not.toHaveBeenCalled();
    expect(ctx.Body).toBe("<media:audio>");
  });

  it("falls back to CLI model when provider fails", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
    await fs.writeFile(audioPath, "hello");

    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: audioPath,
      MediaType: "audio/ogg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            models: [
              { provider: "groq" },
              {
                type: "cli",
                command: "whisper",
                args: ["{{MediaPath}}"],
              },
            ],
          },
        },
      },
    };

    const execModule = await import("../process/exec.js");
    vi.mocked(execModule.runExec).mockResolvedValue({
      stdout: "cli transcript\n",
      stderr: "",
    });

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async () => {
            throw new Error("boom");
          },
        },
      },
    });

    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("cli transcript");
    expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
  });

  it("uses CLI image understanding and preserves caption for commands", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const imagePath = path.join(dir, "photo.jpg");
    await fs.writeFile(imagePath, "image-bytes");

    const ctx: MsgContext = {
      Body: "<media:image> show Dom",
      MediaPath: imagePath,
      MediaType: "image/jpeg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          image: {
            enabled: true,
            models: [
              {
                type: "cli",
                command: "gemini",
                args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
              },
            ],
          },
        },
      },
    };

    const execModule = await import("../process/exec.js");
    vi.mocked(execModule.runExec).mockResolvedValue({
      stdout: "image description\n",
      stderr: "",
    });

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
    });

    expect(result.appliedImage).toBe(true);
    expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
    expect(ctx.CommandBody).toBe("show Dom");
    expect(ctx.RawBody).toBe("show Dom");
  });
});
src/media-understanding/apply.ts (new file, 804 lines)
@@ -0,0 +1,804 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";

import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";

import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { applyTemplate } from "../auto-reply/templating.js";
import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchRemoteMedia } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { runExec } from "../process/exec.js";
import type {
  MediaUnderstandingConfig,
  MediaUnderstandingModelConfig,
  MediaUnderstandingScopeConfig,
} from "../config/types.tools.js";
import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js";
import {
  buildMediaUnderstandingRegistry,
  getMediaUnderstandingProvider,
  normalizeMediaProviderId,
} from "./providers/index.js";
import { fetchWithTimeout } from "./providers/shared.js";
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
import type {
  MediaAttachment,
  MediaUnderstandingOutput,
  MediaUnderstandingProvider,
} from "./types.js";
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";

const MB = 1024 * 1024;
const DEFAULT_MAX_CHARS = 500;
const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<Capability, number | undefined> = {
  image: DEFAULT_MAX_CHARS,
  audio: undefined,
  video: DEFAULT_MAX_CHARS,
};
const DEFAULT_MAX_BYTES: Record<Capability, number> = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB,
};
const DEFAULT_TIMEOUT_SECONDS: Record<Capability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};
const DEFAULT_PROMPT: Record<Capability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
};
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

export type ApplyMediaUnderstandingResult = {
  outputs: MediaUnderstandingOutput[];
  appliedImage: boolean;
  appliedAudio: boolean;
  appliedVideo: boolean;
};

type Capability = "image" | "audio" | "video";

type MediaBufferResult = {
  buffer: Buffer;
  mime?: string;
  fileName: string;
};

type MediaPathResult = {
  path: string;
  cleanup?: () => Promise<void> | void;
};

function normalizeAttachmentPath(raw?: string | null): string | undefined {
  const value = raw?.trim();
  if (!value) return undefined;
  if (value.startsWith("file://")) {
    try {
      return fileURLToPath(value);
    } catch {
      return undefined;
    }
  }
  return value;
}

function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
  const resolveMime = (count: number, index: number) => {
    const typeHint = typesFromArray?.[index];
    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
    if (trimmed) return trimmed;
    return count === 1 ? ctx.MediaType : undefined;
  };

  if (pathsFromArray && pathsFromArray.length > 0) {
    const count = pathsFromArray.length;
    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
    return pathsFromArray
      .map((value, index) => ({
        path: value?.trim() || undefined,
        url: urls?.[index] ?? ctx.MediaUrl,
        mime: resolveMime(count, index),
        index,
      }))
      .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
  }

  if (urlsFromArray && urlsFromArray.length > 0) {
    const count = urlsFromArray.length;
    return urlsFromArray
      .map((value, index) => ({
        path: undefined,
        url: value?.trim() || undefined,
        mime: resolveMime(count, index),
        index,
      }))
      .filter((entry) => Boolean(entry.url?.trim()));
  }

  const pathValue = ctx.MediaPath?.trim();
  const url = ctx.MediaUrl?.trim();
  if (!pathValue && !url) return [];
  return [
    {
      path: pathValue || undefined,
      url: url || undefined,
      mime: ctx.MediaType,
      index: 0,
    },
  ];
}

function isVideoAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("video/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext);
}

function isAudioAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("audio/")) return true;
  return isAudioFileName(attachment.path ?? attachment.url);
}

function isImageAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("image/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext);
}

function estimateBase64Size(bytes: number): number {
  return Math.ceil(bytes / 3) * 4;
}

function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const expanded = Math.floor(maxBytes * (4 / 3));
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}

function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
  const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
  return Math.max(1000, Math.floor(value * 1000));
}

function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string {
  const base = prompt?.trim() || DEFAULT_PROMPT[capability];
  if (!maxChars || capability === "audio") return base;
  return `${base} Respond in at most ${maxChars} characters.`;
}

function resolveRequestUrl(input: RequestInfo | URL): string {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
}

function normalizeErrorMessage(err: unknown): string {
  if (!err) return "";
  if (typeof err === "string") return err;
  if (err instanceof Error) return err.message;
  try {
    return JSON.stringify(err);
  } catch {
    return "";
  }
}

function resolveMaxChars(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
}): number | undefined {
  const { capability, entry, cfg } = params;
  const configured = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
  if (typeof configured === "number") return configured;
  return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
}

function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  if (!maxChars || trimmed.length <= maxChars) return trimmed;
  return trimmed.slice(0, maxChars).trim();
}

function resolveConfigValue<T>(primary: T | undefined, fallback: T): T {
  return primary === undefined ? fallback : primary;
}

function resolveCapabilityConfig(
  cfg: ClawdbotConfig,
  capability: Capability,
): MediaUnderstandingConfig | undefined {
  return cfg.tools?.media?.[capability];
}

function resolveScopeDecision(params: {
  scope?: MediaUnderstandingScopeConfig;
  ctx: MsgContext;
}): "allow" | "deny" {
  return resolveMediaUnderstandingScope({
    scope: params.scope,
    sessionKey: params.ctx.SessionKey,
    channel: params.ctx.Surface ?? params.ctx.Provider,
    chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
  });
}

function resolveModelEntries(
  cfg: MediaUnderstandingConfig | undefined,
  capability: Capability,
): MediaUnderstandingModelConfig[] {
  const models = cfg?.models ?? [];
  if (models.length === 0) return [];
  return models.filter((entry) => {
    const caps = entry.capabilities;
    if (!caps || caps.length === 0) return true;
    return caps.includes(capability);
  });
}

function isMaxBytesError(err: unknown): boolean {
  const message = normalizeErrorMessage(err);
  if (!message) return false;
  return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes");
}

async function loadAttachmentBuffer(params: {
  attachment: MediaAttachment;
  maxBytes: number;
  timeoutMs: number;
}): Promise<MediaBufferResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      const buffer = await fs.readFile(resolved);
      const mime =
        attachment.mime ??
        (await detectMime({
          buffer,
          filePath: resolved,
        }));
      const fileName = path.basename(resolved) || `media-${attachment.index + 1}`;
      return { buffer, mime, fileName };
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }

  const url = attachment.url?.trim();
  if (!url) return undefined;

  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    if (fetched.buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    const mime =
      attachment.mime ??
      fetched.contentType ??
      (await detectMime({
        buffer: fetched.buffer,
        filePath: fetched.fileName ?? url,
      }));
    const fileName = fetched.fileName ?? `media-${attachment.index + 1}`;
    return { buffer: fetched.buffer, mime, fileName };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }

  return undefined;
}

async function resolveAttachmentPath(params: {
  attachment: MediaAttachment;
  maxBytes?: number;
  timeoutMs: number;
}): Promise<MediaPathResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (maxBytes && stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      return { path: resolved };
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }

  const url = attachment.url?.trim();
  if (!url) return undefined;

  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    const buffer = fetched.buffer;
    if (maxBytes && buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
    const tmpPath = path.join(
      os.tmpdir(),
      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
    );
    await fs.writeFile(tmpPath, buffer);
    return {
      path: tmpPath,
      cleanup: async () => {
        await fs.unlink(tmpPath).catch(() => {});
      },
    };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }

  return undefined;
}

async function describeImageWithModel(params: {
  cfg: ClawdbotConfig;
  agentDir: string;
  provider: string;
  model: string;
  prompt: string;
  maxChars?: number;
  buffer: Buffer;
  mimeType: string;
  profile?: string;
  preferredProfile?: string;
}): Promise<{ text: string; model: string }> {
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
  if (!model) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  if (!model.input?.includes("image")) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }
  const apiKeyInfo = await getApiKeyForModel({
    model,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);

  const base64 = params.buffer.toString("base64");
  if (model.provider === "minimax") {
    const text = await minimaxUnderstandImage({
      apiKey: apiKeyInfo.apiKey,
      prompt: params.prompt,
      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
      modelBaseUrl: model.baseUrl,
    });
    return { text, model: model.id };
  }

  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: params.prompt },
          { type: "image", data: base64, mimeType: params.mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const message = (await complete(model, context, {
    apiKey: apiKeyInfo.apiKey,
    maxTokens: 512,
  })) as AssistantMessage;
  const text = coerceImageAssistantText({
    message,
    provider: model.provider,
    model: model.id,
  });
  return { text, model: model.id };
}

async function runProviderEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachment } = params;
  const providerIdRaw = entry.provider?.trim();
  if (!providerIdRaw) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(providerIdRaw);
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );

  if (capability === "image") {
    if (!params.agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const mimeType = media.mime ?? "image/jpeg";
    const result = await describeImageWithModel({
      cfg,
      agentDir: params.agentDir,
      provider: providerId,
      model: modelId,
      prompt,
      maxChars,
      buffer: media.buffer,
      mimeType,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
    });
    return {
      kind: "image.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model,
    };
  }

  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }

  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const result = await provider.transcribeAudio({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model,
      language: entry.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? model,
    };
  }

  if (capability === "video") {
    if (!provider.describeVideo) {
      throw new Error(`Video understanding provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
    if (estimatedBase64Bytes > maxBase64Bytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
        );
      }
      return null;
    }
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const result = await provider.describeVideo({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model: entry.model,
      prompt,
      timeoutMs,
    });
    return {
      kind: "video.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? entry.model,
    };
  }

  return null;
}

async function runCliEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachment } = params;
  const command = entry.command?.trim();
  const args = entry.args ?? [];
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  const pathResult = await resolveAttachmentPath({
    attachment,
    maxBytes,
    timeoutMs,
  });
  if (!pathResult) return null;

  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: pathResult.path,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  const argv = [command, ...args].map((part, index) =>
    index === 0 ? part : applyTemplate(part, templCtx),
  );
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
  }
  try {
    const { stdout } = await runExec(argv[0], argv.slice(1), {
      timeoutMs,
      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
    });
    const text = trimOutput(stdout, maxChars);
    if (!text) return null;
    return {
      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
      attachmentIndex: attachment.index,
      text,
      provider: "cli",
      model: command,
    };
  } finally {
    if (pathResult.cleanup) {
      await pathResult.cleanup();
    }
  }
}

async function runCapability(params: {
  capability: Capability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachment[];
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { capability, cfg, ctx, attachments } = params;
  const config = resolveCapabilityConfig(cfg, capability);
  if (!config || config.enabled === false) return null;
  const entries = resolveModelEntries(config, capability);
  if (entries.length === 0) return null;

  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
  if (scopeDecision === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return null;
  }

  const attachment = attachments.find((item) => {
    if (capability === "image") return isImageAttachment(item);
    if (capability === "audio") return isAudioAttachment(item);
    return isVideoAttachment(item);
  });
  if (!attachment) return null;

  for (const entry of entries) {
    try {
      const entryType = entry.type ?? (entry.command ? "cli" : "provider");
      const result =
        entryType === "cli"
          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
          : await runProviderEntry({
              capability,
              entry,
              cfg,
              ctx,
              attachment,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });
      if (result) return result;
    } catch (err) {
      if (isMaxBytesError(err)) {
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
        }
      } else if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }

  return null;
}

export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: ClawdbotConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg } = params;
  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
  const originalUserText =
    commandCandidates
      .map((value) => extractMediaUserText(value))
      .find((value) => value && value.trim()) ?? undefined;

  const attachments = normalizeAttachments(ctx);
  const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
  const outputs: MediaUnderstandingOutput[] = [];

  const imageOutput = await runCapability({
    capability: "image",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (imageOutput) outputs.push(imageOutput);

  const audioOutput = await runCapability({
    capability: "audio",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (audioOutput) outputs.push(audioOutput);

  const videoOutput = await runCapability({
    capability: "video",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (videoOutput) outputs.push(videoOutput);

  if (outputs.length > 0) {
    ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
    const audioResult = outputs.find((output) => output.kind === "audio.transcription");
    if (audioResult) {
      ctx.Transcript = audioResult.text;
      ctx.CommandBody = audioResult.text;
      ctx.RawBody = audioResult.text;
    } else if (originalUserText) {
      ctx.CommandBody = originalUserText;
      ctx.RawBody = originalUserText;
    }
    ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
  }

  return {
    outputs,
    appliedImage: outputs.some((output) => output.kind === "image.description"),
    appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
    appliedVideo: outputs.some((output) => output.kind === "video.description"),
  };
}
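Note: a minimal sketch of calling the entry point directly with a CLI-only image model. The describe-image command and the /tmp path are placeholders; on this path no provider API key or agentDir is needed, and the CLI's stdout becomes the description.

    import { applyMediaUnderstanding } from "./media-understanding/index.js";
    import type { MsgContext } from "./auto-reply/templating.js";

    const ctx: MsgContext = {
      Body: "<media:image> what is this?",
      MediaPath: "/tmp/photo.jpg", // placeholder
      MediaType: "image/jpeg",
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg: {
        tools: {
          media: {
            image: {
              models: [{ type: "cli", command: "describe-image", args: ["{{MediaPath}}"] }],
            },
          },
        },
      },
    });
    // On success ctx.Body becomes "[Image] ... Description: ...", while
    // ctx.CommandBody/RawBody keep the original caption for command parsing.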
src/media-understanding/format.test.ts (new file, 91 lines)
@@ -0,0 +1,91 @@
import { describe, expect, it } from "vitest";
import { formatMediaUnderstandingBody } from "./format.js";

describe("formatMediaUnderstandingBody", () => {
  it("replaces placeholder body with transcript", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio>",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "hello world",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nTranscript:\nhello world");
  });

  it("includes user text when body is meaningful", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("strips leading media placeholders from user text", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio> caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("keeps user text once when multiple outputs exist", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "audio text",
          provider: "groq",
        },
        {
          kind: "video.description",
          attachmentIndex: 1,
          text: "video text",
          provider: "google",
        },
      ],
    });
    expect(body).toBe(
      [
        "User text:\ncaption here",
        "[Audio]\nTranscript:\naudio text",
        "[Video]\nDescription:\nvideo text",
      ].join("\n\n"),
    );
  });

  it("formats image outputs", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:image>",
      outputs: [
        {
          kind: "image.description",
          attachmentIndex: 0,
          text: "a cat",
          provider: "openai",
        },
      ],
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });
});
src/media-understanding/format.ts (new file, 77 lines)
@@ -0,0 +1,77 @@
import type { MediaUnderstandingOutput } from "./types.js";

const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

export function extractMediaUserText(body?: string): string | undefined {
  const trimmed = body?.trim() ?? "";
  if (!trimmed) return undefined;
  if (MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined;
  const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return cleaned || undefined;
}

function formatSection(
  title: "Audio" | "Video" | "Image",
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const lines = [`[${title}]`];
  if (userText) {
    lines.push(`User text:\n${userText}`);
  }
  lines.push(`${kind}:\n${text}`);
  return lines.join("\n");
}

export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const outputs = params.outputs.filter((output) => output.text.trim());
  if (outputs.length === 0) {
    return params.body ?? "";
  }

  const userText = extractMediaUserText(params.body);
  const sections: string[] = [];
  if (userText && outputs.length > 1) {
    sections.push(`User text:\n${userText}`);
  }

  for (const output of outputs) {
    if (output.kind === "audio.transcription") {
      sections.push(
        formatSection(
          "Audio",
          "Transcript",
          output.text,
          outputs.length === 1 ? userText : undefined,
        ),
      );
      continue;
    }
    if (output.kind === "image.description") {
      sections.push(
        formatSection(
          "Image",
          "Description",
          output.text,
          outputs.length === 1 ? userText : undefined,
        ),
      );
      continue;
    }
    sections.push(
      formatSection(
        "Video",
        "Description",
        output.text,
        outputs.length === 1 ? userText : undefined,
      ),
    );
  }

  return sections.join("\n\n").trim();
}
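Note: the two regexes above give extractMediaUserText its edge cases; a small sketch of the observable behavior, derived directly from the regex definitions:

    import { extractMediaUserText } from "./format.js";

    extractMediaUserText("<media:audio>");          // undefined (placeholder only)
    extractMediaUserText("<media:image> show Dom"); // "show Dom" (leading token stripped)
    extractMediaUserText("  plain caption  ");      // "plain caption"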
src/media-understanding/index.ts (new file, 9 lines)
@@ -0,0 +1,9 @@
export { applyMediaUnderstanding } from "./apply.js";
export { formatMediaUnderstandingBody } from "./format.js";
export { resolveMediaUnderstandingScope } from "./scope.js";
export type {
  MediaAttachment,
  MediaUnderstandingOutput,
  MediaUnderstandingProvider,
  MediaUnderstandingKind,
} from "./types.js";
src/media-understanding/providers/google/index.ts (new file, 7 lines)
@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeGeminiVideo } from "./video.js";

export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeVideo: describeGeminiVideo,
};
src/media-understanding/providers/google/video.test.ts (new file, 93 lines)
@@ -0,0 +1,93 @@
import { describe, expect, it } from "vitest";

import { describeGeminiVideo } from "./video.js";

const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};

describe("describeGeminiVideo", () => {
  it("respects case-insensitive x-goog-api-key overrides", async () => {
    let seenKey: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenKey = headers.get("x-goog-api-key");
      return new Response(
        JSON.stringify({
          candidates: [{ content: { parts: [{ text: "video ok" }] } }],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };

    const result = await describeGeminiVideo({
      buffer: Buffer.from("video"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { "X-Goog-Api-Key": "override" },
      fetchFn,
    });

    expect(seenKey).toBe("override");
    expect(result.text).toBe("video ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(
        JSON.stringify({
          candidates: [
            {
              content: {
                parts: [{ text: "first" }, { text: " second " }, { text: "" }],
              },
            },
          ],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };

    const result = await describeGeminiVideo({
      buffer: Buffer.from("video-bytes"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1500,
      baseUrl: "https://example.com/v1beta/",
      model: "gemini-3-pro",
      headers: { "X-Other": "1" },
      fetchFn,
    });

    expect(result.model).toBe("gemini-3-pro-preview");
    expect(result.text).toBe("first\nsecond");
    expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("x-goog-api-key")).toBe("test-key");
    expect(headers.get("content-type")).toBe("application/json");
    expect(headers.get("x-other")).toBe("1");

    const bodyText =
      typeof seenInit?.body === "string"
        ? seenInit.body
        : Buffer.isBuffer(seenInit?.body)
          ? seenInit.body.toString("utf8")
          : "";
    const body = JSON.parse(bodyText);
    expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video.");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe(
      Buffer.from("video-bytes").toString("base64"),
    );
  });
});
src/media-understanding/providers/google/video.ts (new file, 84 lines)
@@ -0,0 +1,84 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";

export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";

function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  if (!trimmed) return DEFAULT_GOOGLE_VIDEO_MODEL;
  return normalizeGoogleModelId(trimmed);
}

function resolvePrompt(prompt?: string): string {
  const trimmed = prompt?.trim();
  return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT;
}

export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const url = `${baseUrl}/models/${model}:generateContent`;

  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  if (!headers.has("x-goog-api-key")) {
    headers.set("x-goog-api-key", params.apiKey);
  }

  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: resolvePrompt(params.prompt) },
          {
            inline_data: {
              mime_type: params.mime ?? "video/mp4",
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
  }

  const payload = (await res.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  };
  const parts = payload.candidates?.[0]?.content?.parts ?? [];
  const text = parts
    .map((part) => part?.text?.trim())
    .filter(Boolean)
    .join("\n");
  if (!text) {
    throw new Error("Video description response missing text");
  }
  return { text, model };
}
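Note: the clip is inlined as base64 in the JSON body, which inflates the payload by a factor of roughly 4/3; this is presumably why apply.ts caps the estimated base64 size at 70 MB. A sketch of the arithmetic, mirroring estimateBase64Size in apply.ts:

    // Base64 emits 4 output chars for every 3 input bytes.
    const estimateBase64Size = (bytes: number): number => Math.ceil(bytes / 3) * 4;
    estimateBase64Size(50 * 1024 * 1024); // 69,905,068 bytes (~66.7 MB), under the 70 MB cap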
src/media-understanding/providers/groq/index.ts (new file, 13 lines)
@@ -0,0 +1,13 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";

const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";

export const groqProvider: MediaUnderstandingProvider = {
  id: "groq",
  transcribeAudio: (req) =>
    transcribeOpenAiCompatibleAudio({
      ...req,
      baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
    }),
};
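A sketch of what the delegation above amounts to; the input file is illustrative and the model id is an assumption (check Groq's current Whisper model list):

import { readFile } from "node:fs/promises";
import { groqProvider } from "./src/media-understanding/providers/groq/index.js";

const audioBuffer = await readFile("note.ogg"); // hypothetical input
// POSTs to https://api.groq.com/openai/v1/audio/transcriptions
// unless the caller supplies its own baseUrl.
const result = await groqProvider.transcribeAudio?.({
  buffer: audioBuffer,
  fileName: "note.ogg",
  apiKey: process.env.GROQ_API_KEY ?? "",
  model: "whisper-large-v3", // assumed model id
  timeoutMs: 120_000,
});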
src/media-understanding/providers/index.ts (new file, 35 lines)
@@ -0,0 +1,35 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { openaiProvider } from "./openai/index.js";

const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];

export function normalizeMediaProviderId(id: string): string {
  const normalized = normalizeProviderId(id);
  if (normalized === "gemini") return "google";
  return normalized;
}

export function buildMediaUnderstandingRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
): Map<string, MediaUnderstandingProvider> {
  const registry = new Map<string, MediaUnderstandingProvider>();
  for (const provider of PROVIDERS) {
    registry.set(normalizeMediaProviderId(provider.id), provider);
  }
  if (overrides) {
    for (const [key, provider] of Object.entries(overrides)) {
      registry.set(normalizeMediaProviderId(key), provider);
    }
  }
  return registry;
}

export function getMediaUnderstandingProvider(
  id: string,
  registry: Map<string, MediaUnderstandingProvider>,
): MediaUnderstandingProvider | undefined {
  return registry.get(normalizeMediaProviderId(id));
}
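A sketch of typical registry use, assuming only the exports above (the import path is illustrative):

import {
  buildMediaUnderstandingRegistry,
  getMediaUnderstandingProvider,
} from "./src/media-understanding/providers/index.js";

const registry = buildMediaUnderstandingRegistry();
// "gemini" is normalized to "google", so both ids resolve to the same provider.
const viaAlias = getMediaUnderstandingProvider("gemini", registry);
const direct = getMediaUnderstandingProvider("google", registry);
console.log(viaAlias === direct); // true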
src/media-understanding/providers/openai/audio.test.ts (new file, 86 lines)
@@ -0,0 +1,86 @@
import { describe, expect, it } from "vitest";

import { transcribeOpenAiCompatibleAudio } from "./audio.js";

const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};

describe("transcribeOpenAiCompatibleAudio", () => {
  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenAuth = headers.get("authorization");
      return new Response(JSON.stringify({ text: "ok" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };

    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Bearer override" },
      fetchFn,
    });

    expect(seenAuth).toBe("Bearer override");
    expect(result.text).toBe("ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(JSON.stringify({ text: "hello" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };

    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      prompt: " hello ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });

    expect(result.model).toBe("whisper-1");
    expect(result.text).toBe("hello");
    expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("authorization")).toBe("Bearer test-key");
    expect(headers.get("x-custom")).toBe("1");

    const form = seenInit?.body as FormData;
    expect(form).toBeInstanceOf(FormData);
    expect(form.get("model")).toBe("whisper-1");
    expect(form.get("language")).toBe("en");
    expect(form.get("prompt")).toBe("hello");
    const file = form.get("file") as Blob | { type?: string; name?: string } | null;
    expect(file).not.toBeNull();
    if (file) {
      expect(file.type).toBe("audio/wav");
      if ("name" in file && typeof file.name === "string") {
        expect(file.name).toBe("voice.wav");
      }
    }
  });
});
src/media-understanding/providers/openai/audio.ts (new file, 61 lines)
@@ -0,0 +1,61 @@
import path from "node:path";

import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";

export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";

function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
}

export async function transcribeOpenAiCompatibleAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
  const url = `${baseUrl}/audio/transcriptions`;

  const model = resolveModel(params.model);
  const form = new FormData();
  const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio";
  const bytes = new Uint8Array(params.buffer);
  const blob = new Blob([bytes], {
    type: params.mime ?? "application/octet-stream",
  });
  form.append("file", blob, fileName);
  form.append("model", model);
  if (params.language?.trim()) form.append("language", params.language.trim());
  if (params.prompt?.trim()) form.append("prompt", params.prompt.trim());

  const headers = new Headers(params.headers);
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: form,
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
  }

  const payload = (await res.json()) as { text?: string };
  const text = payload.text?.trim();
  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}
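For reference, a minimal direct call to the OpenAI-compatible transcription helper. The input file, import path, and key sourcing are illustrative:

import { readFile } from "node:fs/promises";
import { transcribeOpenAiCompatibleAudio } from "./src/media-understanding/providers/openai/audio.js";

const buffer = await readFile("voice-note.ogg"); // hypothetical input
const { text, model } = await transcribeOpenAiCompatibleAudio({
  buffer,
  fileName: "voice-note.ogg",
  mime: "audio/ogg",
  apiKey: process.env.OPENAI_API_KEY ?? "",
  timeoutMs: 120_000,
});
// model falls back to "whisper-1" when no model is configured.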
src/media-understanding/providers/openai/index.ts (new file, 7 lines)
@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";

export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};
src/media-understanding/providers/shared.ts (new file, 33 lines)
@@ -0,0 +1,33 @@
const MAX_ERROR_CHARS = 300;

export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
  const raw = baseUrl?.trim() || fallback;
  return raw.replace(/\/+$/, "");
}

export async function fetchWithTimeout(
  url: string,
  init: RequestInit,
  timeoutMs: number,
  fetchFn: typeof fetch,
): Promise<Response> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));
  try {
    return await fetchFn(url, { ...init, signal: controller.signal });
  } finally {
    clearTimeout(timer);
  }
}

export async function readErrorResponse(res: Response): Promise<string | undefined> {
  try {
    const text = await res.text();
    const collapsed = text.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    if (collapsed.length <= MAX_ERROR_CHARS) return collapsed;
    return `${collapsed.slice(0, MAX_ERROR_CHARS)}…`;
  } catch {
    return undefined;
  }
}
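A quick sketch of the timeout behavior; the URL is a placeholder:

import { fetchWithTimeout } from "./src/media-understanding/providers/shared.js";

// The AbortController fires after 5s, so the await below rejects
// with an AbortError instead of hanging on a slow endpoint.
const res = await fetchWithTimeout(
  "https://example.com/health",
  { method: "GET" },
  5_000,
  fetch,
);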
src/media-understanding/scope.test.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
import { describe, expect, it } from "vitest";
import { resolveMediaUnderstandingScope } from "./scope.js";

describe("resolveMediaUnderstandingScope", () => {
  it("defaults to allow when scope is undefined", () => {
    expect(resolveMediaUnderstandingScope({})).toBe("allow");
  });

  it("uses first matching rule", () => {
    const decision = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [
          { action: "allow", match: { channel: "whatsapp" } },
          { action: "deny", match: { channel: "whatsapp", chatType: "direct" } },
        ],
      },
      channel: "whatsapp",
      chatType: "direct",
      sessionKey: "whatsapp:direct:123",
    });
    expect(decision).toBe("allow");
  });

  it("matches keyPrefix when provided", () => {
    const decision = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "agent:main:whatsapp:group:123",
    });
    expect(decision).toBe("allow");
  });

  it("matches keyPrefix case-insensitively", () => {
    const decision = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123",
    });
    expect(decision).toBe("allow");
  });
});
src/media-understanding/scope.ts (new file, 54 lines)
@@ -0,0 +1,54 @@
import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js";

export type MediaUnderstandingScopeDecision = "allow" | "deny";

function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined {
  const normalized = value?.trim().toLowerCase();
  if (normalized === "allow") return "allow";
  if (normalized === "deny") return "deny";
  return undefined;
}

function normalizeMatch(value?: string | null): string | undefined {
  const normalized = value?.trim().toLowerCase();
  return normalized || undefined;
}

export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined {
  const value = raw?.trim().toLowerCase();
  if (!value) return undefined;
  if (value === "dm" || value === "direct_message" || value === "private") return "direct";
  if (value === "groups") return "group";
  if (value === "channel") return "room";
  return value;
}

export function resolveMediaUnderstandingScope(params: {
  scope?: MediaUnderstandingScopeConfig;
  sessionKey?: string;
  channel?: string;
  chatType?: string;
}): MediaUnderstandingScopeDecision {
  const scope = params.scope;
  if (!scope) return "allow";

  const channel = normalizeMatch(params.channel);
  const chatType = normalizeMatch(params.chatType);
  const sessionKey = normalizeMatch(params.sessionKey) ?? "";

  for (const rule of scope.rules ?? []) {
    if (!rule) continue;
    const action = normalizeDecision(rule.action) ?? "allow";
    const match = rule.match ?? {};
    const matchChannel = normalizeMatch(match.channel);
    const matchChatType = normalizeMatch(match.chatType);
    const matchPrefix = normalizeMatch(match.keyPrefix);

    if (matchChannel && matchChannel !== channel) continue;
    if (matchChatType && matchChatType !== chatType) continue;
    if (matchPrefix && !sessionKey.startsWith(matchPrefix)) continue;
    return action;
  }

  return normalizeDecision(scope.default) ?? "allow";
}
src/media-understanding/types.ts (new file, 62 lines)
@@ -0,0 +1,62 @@
export type MediaUnderstandingKind =
  | "audio.transcription"
  | "video.description"
  | "image.description";

export type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  index: number;
};

export type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  attachmentIndex: number;
  text: string;
  provider: string;
  model?: string;
};

export type AudioTranscriptionRequest = {
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  language?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

export type AudioTranscriptionResult = {
  text: string;
  model?: string;
};

export type VideoDescriptionRequest = {
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

export type VideoDescriptionResult = {
  text: string;
  model?: string;
};

export type MediaUnderstandingProvider = {
  id: string;
  transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
  describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
};
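A sketch of a custom provider satisfying this interface; "echo" is a hypothetical id, useful mainly as a registry override in tests:

import type { MediaUnderstandingProvider } from "./src/media-understanding/types.js";

const echoProvider: MediaUnderstandingProvider = {
  id: "echo",
  transcribeAudio: async (req) => ({
    text: `received ${req.buffer.length} bytes (${req.mime ?? "unknown mime"})`,
    model: "echo-1", // hypothetical model label
  }),
};

// buildMediaUnderstandingRegistry({ echo: echoProvider }) would register it
// alongside the built-in groq/openai/google providers.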
src/media/fetch.test.ts (new file, 47 lines)
@@ -0,0 +1,47 @@
import { describe, expect, it } from "vitest";

import { fetchRemoteMedia } from "./fetch.js";

function makeStream(chunks: Uint8Array[]) {
  return new ReadableStream<Uint8Array>({
    start(controller) {
      for (const chunk of chunks) {
        controller.enqueue(chunk);
      }
      controller.close();
    },
  });
}

describe("fetchRemoteMedia", () => {
  it("rejects when content-length exceeds maxBytes", async () => {
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), {
        status: 200,
        headers: { "content-length": "5" },
      });

    await expect(
      fetchRemoteMedia({
        url: "https://example.com/file.bin",
        fetchImpl,
        maxBytes: 4,
      }),
    ).rejects.toThrow("exceeds maxBytes");
  });

  it("rejects when streamed payload exceeds maxBytes", async () => {
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), {
        status: 200,
      });

    await expect(
      fetchRemoteMedia({
        url: "https://example.com/file.bin",
        fetchImpl,
        maxBytes: 4,
      }),
    ).rejects.toThrow("exceeds maxBytes");
  });
});
src/media/fetch.ts (modified)
@@ -14,6 +14,7 @@ type FetchMediaOptions = {
   url: string;
   fetchImpl?: FetchLike;
   filePathHint?: string;
+  maxBytes?: number;
 };
 
 function stripQuotes(value: string): string {
@@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise<stri
 }
 
 export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<FetchMediaResult> {
-  const { url, fetchImpl, filePathHint } = options;
+  const { url, fetchImpl, filePathHint, maxBytes } = options;
   const fetcher: FetchLike | undefined = fetchImpl ?? globalThis.fetch;
   if (!fetcher) {
     throw new Error("fetch is not available");
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
     throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
   }
 
-  const buffer = Buffer.from(await res.arrayBuffer());
+  const contentLength = res.headers.get("content-length");
+  if (maxBytes && contentLength) {
+    const length = Number(contentLength);
+    if (Number.isFinite(length) && length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
+      );
+    }
+  }
+
+  const buffer = maxBytes
+    ? await readResponseWithLimit(res, maxBytes)
+    : Buffer.from(await res.arrayBuffer());
   let fileNameFromUrl: string | undefined;
   try {
     const parsed = new URL(url);
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
     fileName,
   };
 }
+
+async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
+  const body = res.body;
+  if (!body || typeof body.getReader !== "function") {
+    const fallback = Buffer.from(await res.arrayBuffer());
+    if (fallback.length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+      );
+    }
+    return fallback;
+  }
+
+  const reader = body.getReader();
+  const chunks: Uint8Array[] = [];
+  let total = 0;
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (value?.length) {
+        total += value.length;
+        if (total > maxBytes) {
+          try {
+            await reader.cancel();
+          } catch {}
+          throw new Error(
+            `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+          );
+        }
+        chunks.push(value);
+      }
+    }
+  } finally {
+    try {
+      reader.releaseLock();
+    } catch {}
+  }
+
+  return Buffer.concat(
+    chunks.map((chunk) => Buffer.from(chunk)),
+    total,
+  );
+}
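A sketch of the new cap in use; the URL is a placeholder and the 10MB figure mirrors the documented image default:

import { fetchRemoteMedia } from "./src/media/fetch.js";

// Rejects early on content-length when the server declares a size,
// or mid-stream once more than maxBytes have been read, rather than
// buffering the whole payload first.
const { buffer, fileName } = await fetchRemoteMedia({
  url: "https://example.com/photo.jpg",
  maxBytes: 10 * 1024 * 1024,
});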