feat: add inbound media understanding
Co-authored-by: Tristan Manchester <tmanchester96@gmail.com>
@@ -89,6 +89,7 @@
 - Docs: add Date & Time guide and update prompt/timezone configuration docs.
 - Messages: debounce rapid inbound messages across channels with per-connector overrides. (#971) — thanks @juanpablodlc.
 - Messages: allow media-only sends (CLI/tool) and show Telegram voice recording status for voice notes. (#957) — thanks @rdev.
+- Media: add optional inbound media understanding for image/audio/video with provider + CLI fallbacks. (#1005) — thanks @tristanmanchester.
 - Auth/Status: keep auth profiles sticky per session (rotate on compaction/new), surface provider usage headers in `/status` and `clawdbot models status`, and update docs.
 - CLI: add `--json` output for `clawdbot daemon` lifecycle/install commands.
 - Memory: make `node-llama-cpp` an optional dependency (avoid Node 25 install failures) and improve local-embeddings fallback/errors.
@@ -124,10 +124,21 @@ Save to `~/.clawdbot/clawdbot.json` and you can DM the bot from that number.

   // Tooling
   tools: {
-    audio: {
-      transcription: {
-        args: ["--model", "base", "{{MediaPath}}"],
+    media: {
+      audio: {
+        enabled: true,
+        maxBytes: 20971520,
+        models: [
+          { provider: "openai", model: "whisper-1" },
+          // Optional CLI fallback (Whisper binary):
+          // { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] }
+        ],
         timeoutSeconds: 120
+      },
+      video: {
+        enabled: true,
+        maxBytes: 52428800,
+        models: [{ provider: "google", model: "gemini-3-flash-preview" }]
       }
     }
   },
@@ -1769,6 +1769,58 @@ Legacy: `tools.bash` is still accepted as an alias.
 - `tools.web.fetch.firecrawl.maxAgeMs` (optional)
 - `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
+
+`tools.media` configures inbound media understanding (image/audio/video):
+
+- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
+  - `enabled`: opt-out switch (default true).
+  - `prompt`: optional prompt override (image/video append a `maxChars` hint automatically).
+  - `maxChars`: max output characters (default 500 for image/video; unset for audio).
+  - `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB).
+  - `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s).
+  - `language`: optional audio hint.
+  - `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`.
+  - `models`: ordered list of model entries; failures or oversize media fall back to the next entry.
+- Each `models[]` entry:
+  - Provider entry (`type: "provider"` or omitted):
+    - `provider`: API provider id (`openai`, `anthropic`, `google`/`gemini`, `groq`, etc).
+    - `model`: model id override (required for image; defaults to `whisper-1`/`whisper-large-v3-turbo` for audio providers, and `gemini-3-flash-preview` for video).
+    - `profile` / `preferredProfile`: auth profile selection.
+  - CLI entry (`type: "cli"`):
+    - `command`: executable to run.
+    - `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc).
+  - `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry.
+  - `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry.
+
+If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments.
+
+Provider auth follows the standard model auth order (auth profiles, env vars like `OPENAI_API_KEY`/`GROQ_API_KEY`/`GEMINI_API_KEY`, or `models.providers.*.apiKey`).
+
+Example:
+
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        maxBytes: 20971520,
+        scope: {
+          default: "deny",
+          rules: [{ action: "allow", match: { chatType: "direct" } }]
+        },
+        models: [
+          { provider: "openai", model: "whisper-1" },
+          { type: "cli", command: "whisper", args: ["--model", "base", "{{MediaPath}}"] }
+        ]
+      },
+      video: {
+        enabled: true,
+        maxBytes: 52428800,
+        models: [{ provider: "google", model: "gemini-3-flash-preview" }]
+      }
+    }
+  }
+}
+```
+
 `agents.defaults.subagents` configures sub-agent defaults:
 - `model`: default model for spawned sub-agents (string or `{ primary, fallbacks }`). If omitted, sub-agents inherit the caller’s model unless overridden per agent or per call.
 - `maxConcurrent`: max concurrent sub-agent runs (default 1)
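
A hedged sketch of the `scope` gating described above, limiting image understanding to one connector namespace via `match.keyPrefix` (the `whatsapp:` prefix is illustrative; actual session-key prefixes depend on your channel setup):

```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        // First match wins; anything unmatched falls through to `default`.
        scope: {
          default: "deny",
          rules: [{ action: "allow", match: { keyPrefix: "whatsapp:" } }]
        },
        models: [{ provider: "openai", model: "gpt-5.2" }]
      }
    }
  }
}
```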
@@ -2848,7 +2900,7 @@ clawdbot dns setup --apply

 ## Template variables

-Template placeholders are expanded in `tools.audio.transcription.args` (and any future templated argument fields).
+Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields).

 | Variable | Description |
 |----------|-------------|
@@ -2864,6 +2916,8 @@ Template placeholders are expanded in `tools.audio.transcription.args` (and any
 | `{{MediaPath}}` | Local media path (if downloaded) |
 | `{{MediaType}}` | Media type (image/audio/document/…) |
 | `{{Transcript}}` | Audio transcript (when enabled) |
+| `{{Prompt}}` | Resolved media prompt for CLI entries |
+| `{{MaxChars}}` | Resolved max output chars for CLI entries |
 | `{{ChatType}}` | `"direct"` or `"group"` |
 | `{{GroupSubject}}` | Group subject (best effort) |
 | `{{GroupMembers}}` | Group members preview (best effort) |
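
For CLI entries, `{{Prompt}}` and `{{MaxChars}}` expand alongside `{{MediaPath}}`. A minimal sketch (the `describe-image` binary here is hypothetical, standing in for any describer on your PATH):

```json5
{
  type: "cli",
  command: "describe-image", // hypothetical binary, not shipped with Clawdbot
  args: ["--max-chars", "{{MaxChars}}", "--prompt", "{{Prompt}}", "{{MediaPath}}"]
}
```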
@@ -111,7 +111,7 @@ Current migrations:
 - `routing.bindings` → top-level `bindings`
 - `routing.agents`/`routing.defaultAgentId` → `agents.list` + `agents.list[].default`
 - `routing.agentToAgent` → `tools.agentToAgent`
-- `routing.transcribeAudio` → `tools.audio.transcription`
+- `routing.transcribeAudio` → `tools.media.audio.models`
 - `bindings[].match.accountID` → `bindings[].match.accountId`
 - `identity` → `agents.list[].identity`
 - `agent.*` → `agents.defaults` + `tools.*` (tools/elevated/exec/sandbox/subagents)
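
A sketch of what the auto-migration does on load, assuming the legacy config used the old single-command Whisper shape:

```json5
// Before (legacy, assumed shape):
{ routing: { transcribeAudio: { command: ["whisper", "--model", "base"], timeoutSeconds: 2 } } }

// After (auto-migrated):
{
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [{ type: "cli", command: "whisper", args: ["--model", "base"], timeoutSeconds: 2 }]
      }
    }
  }
}
```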
@@ -3,25 +3,59 @@ summary: "How inbound audio/voice notes are downloaded, transcribed, and injecte
 read_when:
 - Changing audio transcription or media handling
 ---
-# Audio / Voice Notes — 2025-12-05
+# Audio / Voice Notes — 2026-01-17

 ## What works
-- **Optional transcription**: If `tools.audio.transcription` is set in `~/.clawdbot/clawdbot.json`, Clawdbot will:
-  1) Download inbound audio to a temp path when WhatsApp only provides a URL.
-  2) Run the configured CLI args (templated with `{{MediaPath}}`), expecting transcript on stdout.
-  3) Replace `Body` with the transcript, set `{{Transcript}}`, and prepend the original media path plus a `Transcript:` section in the command prompt so models see both.
-  4) Continue through the normal auto-reply pipeline (templating, sessions, Pi command).
-- **Verbose logging**: In `--verbose`, we log when transcription runs and when the transcript replaces the body.
+- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot:
+  1) Locates the first audio attachment (local path or URL) and downloads it if needed.
+  2) Enforces `maxBytes` before sending to each model entry.
+  3) Runs the first eligible model entry in order (provider or CLI).
+  4) If it fails or skips (size/timeout), it tries the next entry.
+  5) On success, it replaces `Body` with an `[Audio]` block and sets `{{Transcript}}`.
+- **Command parsing**: When transcription succeeds, `CommandBody`/`RawBody` are set to the transcript so slash commands still work.
+- **Verbose logging**: In `--verbose`, we log when transcription runs and when it replaces the body.

-## Config example (Whisper CLI)
-Requires `whisper` CLI installed:
+## Config examples
+
+### Provider + CLI fallback (OpenAI + Whisper CLI)
 ```json5
 {
   tools: {
-    audio: {
-      transcription: {
-        args: ["--model", "base", "{{MediaPath}}"],
-        timeoutSeconds: 45
+    media: {
+      audio: {
+        enabled: true,
+        maxBytes: 20971520,
+        models: [
+          { provider: "openai", model: "whisper-1" },
+          {
+            type: "cli",
+            command: "whisper",
+            args: ["--model", "base", "{{MediaPath}}"],
+            timeoutSeconds: 45
+          }
+        ]
+      }
+    }
+  }
+}
+```
+
+### Provider-only with scope gating
+```json5
+{
+  tools: {
+    media: {
+      audio: {
+        enabled: true,
+        scope: {
+          default: "allow",
+          rules: [
+            { action: "deny", match: { chatType: "group" } }
+          ]
+        },
+        models: [
+          { provider: "openai", model: "whisper-1" }
+        ]
       }
     }
   }
 }
@@ -29,12 +63,13 @@ Requires `whisper` CLI installed:
 ```

 ## Notes & limits
-- We don’t ship a transcriber; you opt in with the Whisper CLI on your PATH.
+- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
-- Size guard: inbound audio must be ≤5 MB (matches the temp media store and transcript pipeline).
+- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
-- Outbound caps: web send supports audio/voice up to 16 MB (sent as a voice note with `ptt: true`).
+- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
-- If transcription fails, we fall back to the original body/media note; replies still go through.
+- Transcript is available to templates as `{{Transcript}}`.
-- Transcript is available to templates as `{{Transcript}}`; models get both the media path and a `Transcript:` block in the prompt when using command mode.
+- CLI stdout is capped (5MB); keep CLI output concise.

 ## Gotchas
+- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`.
 - Ensure your CLI exits 0 and prints plain text; JSON needs to be massaged via `jq -r .text`.
-- Keep timeouts reasonable (`timeoutSeconds`, default 45s) to avoid blocking the reply queue.
+- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue.
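
One way to satisfy the plain-text gotcha above is to wrap a JSON-emitting transcriber in a shell pipeline. A sketch, assuming `jq` is on your PATH and `my-transcriber` is a placeholder for any CLI that prints `{"text": "..."}`:

```json5
{
  type: "cli",
  command: "sh",
  // "my-transcriber" is hypothetical; substitute your actual JSON-emitting CLI.
  args: ["-c", "my-transcriber {{MediaPath}} | jq -r .text"]
}
```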
@@ -38,13 +38,23 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren
 - `{{MediaUrl}}` pseudo-URL for the inbound media.
 - `{{MediaPath}}` local temp path written before running the command.
 - When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/<filename>`.
-- Audio transcription (if configured via `tools.audio.transcription`) runs before templating and can replace `Body` with the transcript.
+- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
+  - Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
+  - Video and image descriptions preserve any caption text for command parsing.
+  - Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched.

 ## Limits & Errors
+**Outbound send caps (WhatsApp web send)**
 - Images: ~6 MB cap after recompression.
 - Audio/voice/video: 16 MB cap; documents: 100 MB cap.
 - Oversize or unreadable media → clear error in logs and the reply is skipped.
+
+**Media understanding caps (transcription/description)**
+- Image default: 10 MB (`tools.media.image.maxBytes`).
+- Audio default: 20 MB (`tools.media.audio.maxBytes`).
+- Video default: 50 MB (`tools.media.video.maxBytes`).
+- Oversize media skips understanding, but replies still go through with the original body.

 ## Notes for Tests
 - Cover send + reply flows for image/audio/document cases.
 - Validate recompression for images (size bound) and voice-note flag for audio.
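
A hedged sketch of limiting audio understanding to WhatsApp DMs only, assuming your channel id is `whatsapp` (channel ids follow your connector configuration):

```json5
{
  tools: {
    media: {
      audio: {
        scope: {
          default: "deny",
          rules: [{ action: "allow", match: { channel: "whatsapp", chatType: "direct" } }]
        },
        models: [{ provider: "openai", model: "whisper-1" }]
      }
    }
  }
}
```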
docs/nodes/media-understanding.md (new file, 217 lines)
@@ -0,0 +1,217 @@
---
summary: "Inbound image/audio/video understanding (optional) with provider + CLI fallbacks"
read_when:
- Designing or refactoring media understanding
- Tuning inbound audio/video/image preprocessing
---
# Media Understanding (Inbound) — 2026-01-17

Clawdbot can optionally **summarize inbound media** (image/audio/video) before the reply pipeline runs. This is **opt-in** and separate from the base attachment flow—if understanding is off, models still receive the original files/URLs as usual.

## Goals
- Optional: pre‑digest inbound media into short text for faster routing + better command parsing.
- Preserve original media delivery to the model (always).
- Support **provider APIs** and **CLI fallbacks**.
- Allow multiple models with ordered fallback (error/size/timeout).

## High‑level behavior
1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
2) For each enabled capability (image/audio/video), pick the **first matching attachment**.
3) Choose the first eligible model entry (size + capability + auth).
4) If a model fails or the media is too large, **fall back to the next entry**.
5) On success:
   - `Body` becomes an `[Image]`, `[Audio]`, or `[Video]` block.
   - Audio sets `{{Transcript}}` and `CommandBody`/`RawBody` for command parsing.
   - Captions are preserved as `User text:` inside the block.

If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.

## Config overview
Use **per‑capability configs** under `tools.media`. Each capability can define:
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
- an **ordered `models` list** (fallback order)
- `scope` (optional gating by channel/chatType/session key)

```json5
{
  tools: {
    media: {
      image: { /* config */ },
      audio: { /* config */ },
      video: { /* config */ }
    }
  }
}
```

### Model entries
Each `models[]` entry can be **provider** or **CLI**:

```json5
{
  type: "provider", // default if omitted
  provider: "openai",
  model: "gpt-5.2",
  prompt: "Describe the image in <= 500 chars.",
  maxChars: 500,
  maxBytes: 10485760,
  timeoutSeconds: 60,
  capabilities: ["image"], // optional, used for multi‑modal entries
  profile: "vision-profile",
  preferredProfile: "vision-fallback"
}
```

```json5
{
  type: "cli",
  command: "gemini",
  args: [
    "-m",
    "gemini-3-flash",
    "--allowed-tools",
    "read_file",
    "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
  ],
  maxChars: 500,
  maxBytes: 52428800,
  timeoutSeconds: 120,
  capabilities: ["video", "image"]
}
```

## Defaults and limits
Recommended defaults:
- `maxChars`: **500** for image/video (short, command‑friendly)
- `maxChars`: **unset** for audio (full transcript unless you set a limit)
- `maxBytes`:
  - image: **10MB**
  - audio: **20MB**
  - video: **50MB**

Rules:
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
- If the model returns more than `maxChars`, output is trimmed.
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
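A sketch of how the size rule drives fallback: a small-cap provider entry first, then a larger-cap CLI entry (the gemini CLI entry from above) that catches oversize media:

```json5
{
  models: [
    // Tried first; skipped for anything over 10MB, which triggers the next entry.
    { provider: "openai", model: "gpt-5.2", maxBytes: 10485760 },
    // Fallback for larger files (50MB cap).
    {
      type: "cli",
      command: "gemini",
      args: [
        "-m", "gemini-3-flash", "--allowed-tools", "read_file",
        "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
      ],
      maxBytes: 52428800
    }
  ]
}
```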
## Capabilities (optional)
If you set `capabilities`, the entry only runs for those media types. Suggested defaults when you opt in:
- `openai`, `anthropic`: **image**
- `google` (Gemini API): **image + audio + video**
- CLI entries: declare the exact capabilities you support.

If you omit `capabilities`, the entry is eligible for the list it appears in.

## Provider support matrix (Clawdbot integrations)
| Capability | Provider integration | Notes |
|------------|----------------------|-------|
| Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. |
| Audio | OpenAI, Groq | Provider transcription (Whisper). |
| Video | Google (Gemini API) | Provider video understanding. |

## Recommended providers
**Image**
- Prefer your active model if it supports images.
- Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`.

**Audio**
- `openai/whisper-1` or `groq/whisper-large-v3-turbo`.
- CLI fallback: `whisper` binary.

**Video**
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
- CLI fallback: `gemini` CLI (supports `read_file` on video/audio).

## Config examples

### 1) Audio + Video only (image off)
```json5
{
  tools: {
    media: {
      audio: {
        enabled: true,
        models: [
          { provider: "openai", model: "whisper-1" },
          {
            type: "cli",
            command: "whisper",
            args: ["--model", "base", "{{MediaPath}}"]
          }
        ]
      },
      video: {
        enabled: true,
        maxChars: 500,
        models: [
          { provider: "google", model: "gemini-3-flash-preview" },
          {
            type: "cli",
            command: "gemini",
            args: [
              "-m",
              "gemini-3-flash",
              "--allowed-tools",
              "read_file",
              "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
            ]
          }
        ]
      }
    }
  }
}
```

### 2) Optional image understanding
```json5
{
  tools: {
    media: {
      image: {
        enabled: true,
        maxBytes: 10485760,
        maxChars: 500,
        models: [
          { provider: "openai", model: "gpt-5.2" },
          { provider: "anthropic", model: "claude-opus-4-5" },
          {
            type: "cli",
            command: "gemini",
            args: [
              "-m",
              "gemini-3-flash",
              "--allowed-tools",
              "read_file",
              "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
            ]
          }
        ]
      }
    }
  }
}
```

### 3) Multi‑modal single entry (explicit capabilities)
```json5
{
  tools: {
    media: {
      image: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] },
      audio: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] },
      video: { models: [{ provider: "google", model: "gemini-3-pro-preview", capabilities: ["image", "video", "audio"] }] }
    }
  }
}
```

## Notes
- Understanding is **best‑effort**. Errors do not block replies.
- Attachments are still passed to models even when understanding is disabled.
- Use `scope` to limit where understanding runs (e.g. only DMs).

## Related docs
- [Configuration](/gateway/configuration)
- [Image & Media Support](/nodes/images)
@@ -254,7 +254,10 @@ export function createExecTool(
   let yielded = false;
   let yieldTimer: NodeJS.Timeout | null = null;
   let timeoutTimer: NodeJS.Timeout | null = null;
+  let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
   let timedOut = false;
+  const timeoutFinalizeMs = 1000;
+  let rejectFn: ((err: Error) => void) | null = null;

   const settle = (fn: () => void) => {
     if (settled) return;
@@ -262,6 +265,18 @@ export function createExecTool(
     fn();
   };
+
+  const effectiveTimeout =
+    typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
+  const finalizeTimeout = () => {
+    if (session.exited) return;
+    markExited(session, null, "SIGKILL", "failed");
+    if (settled || !rejectFn) return;
+    const aggregated = session.aggregated.trim();
+    const reason = `Command timed out after ${effectiveTimeout} seconds`;
+    const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
+    settle(() => rejectFn?.(new Error(message)));
+  };
+
   // Tool-call abort should not kill backgrounded sessions; timeouts still must.
   const onAbortSignal = () => {
     if (yielded || session.backgrounded) return;
@@ -272,15 +287,17 @@ export function createExecTool(
   const onTimeout = () => {
     timedOut = true;
     killSession(session);
+    if (!timeoutFinalizeTimer) {
+      timeoutFinalizeTimer = setTimeout(() => {
+        finalizeTimeout();
+      }, timeoutFinalizeMs);
+    }
   };

   if (signal?.aborted) onAbortSignal();
   else if (signal) {
     signal.addEventListener("abort", onAbortSignal, { once: true });
   }
-
-  const effectiveTimeout =
-    typeof params.timeout === "number" ? params.timeout : defaultTimeoutSec;
   if (effectiveTimeout > 0) {
     timeoutTimer = setTimeout(() => {
       onTimeout();
@@ -321,6 +338,7 @@ export function createExecTool(
   });

   return new Promise<AgentToolResult<ExecToolDetails>>((resolve, reject) => {
+    rejectFn = reject;
     const resolveRunning = () => {
       settle(() =>
         resolve({
@@ -369,6 +387,7 @@ export function createExecTool(
   const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
     if (yieldTimer) clearTimeout(yieldTimer);
     if (timeoutTimer) clearTimeout(timeoutTimer);
+    if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
     const durationMs = Date.now() - startedAt;
     const wasSignal = exitSignal != null;
     const isSuccess = code === 0 && !wasSignal && !signal?.aborted && !timedOut;
@@ -421,6 +440,7 @@ export function createExecTool(
   child.once("error", (err) => {
     if (yieldTimer) clearTimeout(yieldTimer);
     if (timeoutTimer) clearTimeout(timeoutTimer);
+    if (timeoutFinalizeTimer) clearTimeout(timeoutFinalizeTimer);
     markExited(session, null, null, "failed");
     settle(() => reject(err));
   });
@@ -25,4 +25,20 @@ describe("buildInboundMediaNote", () => {
     ].join("\n"),
   );
 });
+
+it("skips media notes for attachments with understanding output", () => {
+  const note = buildInboundMediaNote({
+    MediaPaths: ["/tmp/a.png", "/tmp/b.png"],
+    MediaUrls: ["https://example.com/a.png", "https://example.com/b.png"],
+    MediaUnderstanding: [
+      {
+        kind: "audio.transcription",
+        attachmentIndex: 0,
+        text: "hello",
+        provider: "groq",
+      },
+    ],
+  });
+  expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
+});
 });
@@ -18,6 +18,12 @@ function formatMediaAttachedLine(params: {
 }

 export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
+  // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
+  const suppressed = new Set(
+    Array.isArray(ctx.MediaUnderstanding)
+      ? ctx.MediaUnderstanding.map((output) => output.attachmentIndex)
+      : [],
+  );
   const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
   const paths =
     pathsFromArray && pathsFromArray.length > 0
@@ -36,24 +42,33 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
       ? ctx.MediaTypes
       : undefined;

-  if (paths.length === 1) {
+  const entries = paths
+    .map((entry, index) => ({
+      path: entry ?? "",
+      type: types?.[index] ?? ctx.MediaType,
+      url: urls?.[index] ?? ctx.MediaUrl,
+      index,
+    }))
+    .filter((entry) => !suppressed.has(entry.index));
+  if (entries.length === 0) return undefined;
+  if (entries.length === 1) {
     return formatMediaAttachedLine({
-      path: paths[0] ?? "",
-      type: types?.[0] ?? ctx.MediaType,
-      url: urls?.[0] ?? ctx.MediaUrl,
+      path: entries[0]?.path ?? "",
+      type: entries[0]?.type,
+      url: entries[0]?.url,
     });
   }

-  const count = paths.length;
+  const count = entries.length;
   const lines: string[] = [`[media attached: ${count} files]`];
-  for (const [idx, mediaPath] of paths.entries()) {
+  for (const [idx, entry] of entries.entries()) {
     lines.push(
       formatMediaAttachedLine({
-        path: mediaPath,
+        path: entry.path,
         index: idx + 1,
         total: count,
-        type: types?.[idx],
-        url: urls?.[idx],
+        type: entry.type,
+        url: entry.url,
       }),
     );
   }
@@ -122,6 +122,7 @@ export async function resolveReplyDirectives(params: {
   const commandSource =
     sessionCtx.CommandBody ??
     sessionCtx.RawBody ??
+    sessionCtx.Transcript ??
     sessionCtx.BodyStripped ??
     sessionCtx.Body ??
     "";
@@ -87,7 +87,6 @@ type RunPreparedReplyParams = {
     cap?: number;
     dropPolicy?: InlineDirectives["dropPolicy"];
   };
-  transcribedText?: string;
   typing: TypingController;
   opts?: GetReplyOptions;
   defaultModel: string;
@@ -210,7 +209,6 @@ export async function runPreparedReply(
   model,
   perMessageQueueMode,
   perMessageQueueOptions,
-  transcribedText,
   typing,
   opts,
   defaultModel,
@@ -325,11 +323,7 @@ export async function runPreparedReply(
   sessionEntry = skillResult.sessionEntry ?? sessionEntry;
   currentSystemSent = skillResult.systemSent;
   const skillsSnapshot = skillResult.skillsSnapshot;
-  const prefixedBody = transcribedText
-    ? [threadStarterNote, prefixedBodyBase, `Transcript:\n${transcribedText}`]
-        .filter(Boolean)
-        .join("\n\n")
-    : [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
+  const prefixedBody = [threadStarterNote, prefixedBodyBase].filter(Boolean).join("\n\n");
   const mediaNote = buildInboundMediaNote(ctx);
   const mediaReplyHint = mediaNote
     ? "To send an image back, add a line like: MEDIA:https://example.com/image.jpg (no spaces). Keep caption in the text body."
@@ -370,11 +364,7 @@ export async function runPreparedReply(
   }
   const sessionIdFinal = sessionId ?? crypto.randomUUID();
   const sessionFile = resolveSessionFilePath(sessionIdFinal, sessionEntry);
-  const queueBodyBase = transcribedText
-    ? [threadStarterNote, baseBodyFinal, `Transcript:\n${transcribedText}`]
-        .filter(Boolean)
-        .join("\n\n")
-    : [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
+  const queueBodyBase = [threadStarterNote, baseBodyFinal].filter(Boolean).join("\n\n");
   const queuedBody = mediaNote
     ? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
     : queueBodyBase;
@@ -7,12 +7,11 @@ import { resolveModelRefFromString } from "../../agents/model-selection.js";
 import { resolveAgentTimeoutMs } from "../../agents/timeout.js";
 import { DEFAULT_AGENT_WORKSPACE_DIR, ensureAgentWorkspace } from "../../agents/workspace.js";
 import { type ClawdbotConfig, loadConfig } from "../../config/config.js";
-import { logVerbose } from "../../globals.js";
 import { defaultRuntime } from "../../runtime.js";
 import { resolveCommandAuthorization } from "../command-auth.js";
 import type { MsgContext } from "../templating.js";
 import { SILENT_REPLY_TOKEN } from "../tokens.js";
-import { hasAudioTranscriptionConfig, isAudio, transcribeInboundAudio } from "../transcription.js";
+import { applyMediaUnderstanding } from "../../media-understanding/apply.js";
 import type { GetReplyOptions, ReplyPayload } from "../types.js";
 import { resolveDefaultModel } from "./directive-handling.js";
 import { resolveReplyDirectives } from "./get-reply-directives.js";
@@ -75,16 +74,11 @@ export async function getReplyFromConfig(
   });
   opts?.onTypingController?.(typing);

-  let transcribedText: string | undefined;
-  if (hasAudioTranscriptionConfig(cfg) && isAudio(ctx.MediaType)) {
-    const transcribed = await transcribeInboundAudio(cfg, ctx, defaultRuntime);
-    if (transcribed?.text) {
-      transcribedText = transcribed.text;
-      ctx.Body = transcribed.text;
-      ctx.Transcript = transcribed.text;
-      logVerbose("Replaced Body with audio transcript for reply flow");
-    }
-  }
+  await applyMediaUnderstanding({
+    ctx,
+    cfg,
+    agentDir,
+  });

   const commandAuthorized = ctx.CommandAuthorized ?? true;
   resolveCommandAuthorization({
@@ -253,7 +247,6 @@ export async function getReplyFromConfig(
   model,
   perMessageQueueMode,
   perMessageQueueOptions,
-  transcribedText,
   typing,
   opts,
   defaultModel,
@@ -1,6 +1,7 @@
 import type { ChannelId } from "../channels/plugins/types.js";
 import type { InternalMessageChannel } from "../utils/message-channel.js";
 import type { CommandArgs } from "./commands-registry.types.js";
+import type { MediaUnderstandingOutput } from "../media-understanding/types.js";

 /** Valid message channels for routing. */
 export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -41,6 +42,9 @@ export type MsgContext = {
   /** Remote host for SCP when media lives on a different machine (e.g., clawdbot@192.168.64.3). */
   MediaRemoteHost?: string;
   Transcript?: string;
+  MediaUnderstanding?: MediaUnderstandingOutput[];
+  Prompt?: string;
+  MaxChars?: number;
   ChatType?: string;
   GroupSubject?: string;
   GroupRoom?: string;
@@ -65,7 +65,7 @@ describe("legacy config detection", () => {
     expect(res.config?.messages?.groupChat?.mentionPatterns).toEqual(["@clawd"]);
     expect(res.config?.routing?.groupChat?.mentionPatterns).toBeUndefined();
   });
-  it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/audio", async () => {
+  it("migrates routing agentToAgent/queue/transcribeAudio to tools/messages/media", async () => {
     vi.resetModules();
     const { migrateLegacyConfig } = await import("./config.js");
     const res = migrateLegacyConfig({
@@ -80,7 +80,7 @@ describe("legacy config detection", () => {
     });
     expect(res.changes).toContain("Moved routing.agentToAgent → tools.agentToAgent.");
     expect(res.changes).toContain("Moved routing.queue → messages.queue.");
-    expect(res.changes).toContain("Moved routing.transcribeAudio → tools.audio.transcription.");
+    expect(res.changes).toContain("Moved routing.transcribeAudio → tools.media.audio.models.");
     expect(res.config?.tools?.agentToAgent).toEqual({
       enabled: true,
       allow: ["main"],
@@ -89,9 +89,16 @@ describe("legacy config detection", () => {
       mode: "queue",
       cap: 3,
     });
-    expect(res.config?.tools?.audio?.transcription).toEqual({
-      args: ["--model", "base"],
-      timeoutSeconds: 2,
+    expect(res.config?.tools?.media?.audio).toEqual({
+      enabled: true,
+      models: [
+        {
+          command: "whisper",
+          type: "cli",
+          args: ["--model", "base"],
+          timeoutSeconds: 2,
+        },
+      ],
     });
     expect(res.config?.routing).toBeUndefined();
   });
@@ -330,14 +330,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
     const mapped = mapLegacyAudioTranscription(routing.transcribeAudio);
     if (mapped) {
       const tools = ensureRecord(raw, "tools");
-      const toolsAudio = ensureRecord(tools, "audio");
-      if (toolsAudio.transcription === undefined) {
-        toolsAudio.transcription = mapped;
-        changes.push("Moved routing.transcribeAudio → tools.audio.transcription.");
+      const media = ensureRecord(tools, "media");
+      const mediaAudio = ensureRecord(media, "audio");
+      const models = Array.isArray(mediaAudio.models)
+        ? (mediaAudio.models as unknown[])
+        : [];
+      if (models.length === 0) {
+        mediaAudio.enabled = true;
+        mediaAudio.models = [mapped];
+        changes.push("Moved routing.transcribeAudio → tools.media.audio.models.");
       } else {
-        changes.push(
-          "Removed routing.transcribeAudio (tools.audio.transcription already set).",
-        );
+        changes.push("Removed routing.transcribeAudio (tools.media.audio.models already set).");
       }
     } else {
       changes.push("Removed routing.transcribeAudio (unsupported transcription CLI).");
@@ -350,12 +353,17 @@ export const LEGACY_CONFIG_MIGRATIONS_PART_2: LegacyConfigMigration[] = [
     const mapped = mapLegacyAudioTranscription(audio.transcription);
     if (mapped) {
       const tools = ensureRecord(raw, "tools");
-      const toolsAudio = ensureRecord(tools, "audio");
-      if (toolsAudio.transcription === undefined) {
-        toolsAudio.transcription = mapped;
-        changes.push("Moved audio.transcription → tools.audio.transcription.");
+      const media = ensureRecord(tools, "media");
+      const mediaAudio = ensureRecord(media, "audio");
+      const models = Array.isArray(mediaAudio.models)
+        ? (mediaAudio.models as unknown[])
+        : [];
+      if (models.length === 0) {
+        mediaAudio.enabled = true;
+        mediaAudio.models = [mapped];
+        changes.push("Moved audio.transcription → tools.media.audio.models.");
       } else {
-        changes.push("Removed audio.transcription (tools.audio.transcription already set).");
+        changes.push("Removed audio.transcription (tools.media.audio.models already set).");
       }
       delete audio.transcription;
       if (Object.keys(audio).length === 0) delete raw.audio;
@@ -69,7 +69,7 @@ export const LEGACY_CONFIG_RULES: LegacyConfigRule[] = [
   {
     path: ["routing", "transcribeAudio"],
     message:
-      "routing.transcribeAudio was moved; use tools.audio.transcription instead (auto-migrated on load).",
+      "routing.transcribeAudio was moved; use tools.media.audio.models instead (auto-migrated on load).",
   },
   {
     path: ["telegram", "requireMention"],
@@ -56,7 +56,7 @@ export const mapLegacyAudioTranscription = (value: unknown): Record<string, unkn
   const timeoutSeconds =
     typeof transcriber?.timeoutSeconds === "number" ? transcriber?.timeoutSeconds : undefined;

-  const result: Record<string, unknown> = {};
+  const result: Record<string, unknown> = { command: rawExecutable, type: "cli" };
   if (args.length > 0) result.args = args;
   if (timeoutSeconds !== undefined) result.timeoutSeconds = timeoutSeconds;
   return result;
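
The mapped entry now carries the executable and `type: "cli"`, so it can slot directly into `tools.media.audio.models`. Given a legacy `whisper` executable with `["--model", "base"]` args and a 2s timeout, the mapper's output would be the sketch below:

```json5
{ type: "cli", command: "whisper", args: ["--model", "base"], timeoutSeconds: 2 }
```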
@@ -102,8 +102,28 @@ const FIELD_LABELS: Record<string, string> = {
   "gateway.remote.password": "Remote Gateway Password",
   "gateway.auth.token": "Gateway Token",
   "gateway.auth.password": "Gateway Password",
-  "tools.audio.transcription.args": "Audio Transcription Args",
-  "tools.audio.transcription.timeoutSeconds": "Audio Transcription Timeout (sec)",
+  "tools.media.image.enabled": "Enable Image Understanding",
+  "tools.media.image.maxBytes": "Image Understanding Max Bytes",
+  "tools.media.image.maxChars": "Image Understanding Max Chars",
+  "tools.media.image.prompt": "Image Understanding Prompt",
+  "tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
+  "tools.media.image.models": "Image Understanding Models",
+  "tools.media.image.scope": "Image Understanding Scope",
+  "tools.media.audio.enabled": "Enable Audio Understanding",
+  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
+  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
+  "tools.media.audio.prompt": "Audio Understanding Prompt",
+  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
+  "tools.media.audio.language": "Audio Understanding Language",
+  "tools.media.audio.models": "Audio Understanding Models",
+  "tools.media.audio.scope": "Audio Understanding Scope",
+  "tools.media.video.enabled": "Enable Video Understanding",
+  "tools.media.video.maxBytes": "Video Understanding Max Bytes",
+  "tools.media.video.maxChars": "Video Understanding Max Chars",
+  "tools.media.video.prompt": "Video Understanding Prompt",
+  "tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
+  "tools.media.video.models": "Video Understanding Models",
+  "tools.media.video.scope": "Video Understanding Scope",
   "tools.profile": "Tool Profile",
   "agents.list[].tools.profile": "Agent Tool Profile",
   "tools.byProvider": "Tool Policy by Provider",
@@ -47,7 +47,7 @@ export type BroadcastConfig = {
 };

 export type AudioConfig = {
-  /** @deprecated Use tools.audio.transcription instead. */
+  /** @deprecated Use tools.media.audio.models instead. */
   transcription?: {
     // Optional CLI to turn inbound audio into text; templated args, must output transcript to stdout.
     command: string[];
@@ -1,4 +1,76 @@
-import type { AgentElevatedAllowFromConfig } from "./types.base.js";
+import type { AgentElevatedAllowFromConfig, SessionSendPolicyAction } from "./types.base.js";
+
+export type MediaUnderstandingScopeMatch = {
+  channel?: string;
+  chatType?: "direct" | "group" | "room";
+  keyPrefix?: string;
+};
+
+export type MediaUnderstandingScopeRule = {
+  action: SessionSendPolicyAction;
+  match?: MediaUnderstandingScopeMatch;
+};
+
+export type MediaUnderstandingScopeConfig = {
+  default?: SessionSendPolicyAction;
+  rules?: MediaUnderstandingScopeRule[];
+};
+
+export type MediaUnderstandingCapability = "image" | "audio" | "video";
+
+export type MediaUnderstandingModelConfig = {
+  /** Provider API id (e.g. openai, google). */
+  provider?: string;
+  /** Model id for provider-based understanding. */
+  model?: string;
+  /** Optional capability tags for shared model lists. */
+  capabilities?: MediaUnderstandingCapability[];
+  /** Use a CLI command instead of provider API. */
+  type?: "provider" | "cli";
+  /** CLI binary (required when type=cli). */
+  command?: string;
+  /** CLI args (template-enabled). */
+  args?: string[];
+  /** Optional prompt override for this model entry. */
+  prompt?: string;
+  /** Optional max output characters for this model entry. */
+  maxChars?: number;
+  /** Optional max bytes for this model entry. */
+  maxBytes?: number;
+  /** Optional timeout override (seconds) for this model entry. */
+  timeoutSeconds?: number;
+  /** Optional language hint for audio transcription. */
+  language?: string;
+  /** Auth profile id to use for this provider. */
+  profile?: string;
+  /** Preferred profile id if multiple are available. */
+  preferredProfile?: string;
+};
+
+export type MediaUnderstandingConfig = {
+  /** Enable media understanding when models are configured. */
+  enabled?: boolean;
+  /** Optional scope gating for understanding. */
+  scope?: MediaUnderstandingScopeConfig;
+  /** Default max bytes to send. */
+  maxBytes?: number;
+  /** Default max output characters. */
+  maxChars?: number;
+  /** Default prompt. */
+  prompt?: string;
+  /** Default timeout (seconds). */
+  timeoutSeconds?: number;
+  /** Default language hint (audio). */
+  language?: string;
+  /** Ordered model list (fallbacks in order). */
+  models?: MediaUnderstandingModelConfig[];
+};
+
+export type MediaToolsConfig = {
+  image?: MediaUnderstandingConfig;
+  audio?: MediaUnderstandingConfig;
+  video?: MediaUnderstandingConfig;
+};

 export type ToolProfileId = "minimal" | "coding" | "messaging" | "full";
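
A config value conforming to `MediaToolsConfig` might look like the following sketch (model ids taken from the docs above):

```json5
{
  image: { enabled: true, maxChars: 500, models: [{ provider: "openai", model: "gpt-5.2" }] },
  audio: { enabled: true, language: "en", models: [{ provider: "groq", model: "whisper-large-v3-turbo" }] },
  video: { enabled: false }
}
```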
@@ -127,13 +199,7 @@ export type ToolsConfig = {
       };
     };
   };
-  audio?: {
-    transcription?: {
-      /** CLI args (template-enabled). */
-      args?: string[];
-      timeoutSeconds?: number;
-    };
-  };
+  media?: MediaToolsConfig;
   /** Message tool configuration. */
   message?: {
     /**
@@ -5,7 +5,7 @@ import {
   GroupChatSchema,
   HumanDelaySchema,
   IdentitySchema,
-  ToolsAudioTranscriptionSchema,
+  ToolsMediaSchema,
 } from "./zod-schema.core.js";

 export const HeartbeatSchema = z
@@ -283,11 +283,7 @@ export const ToolsSchema = z
   deny: z.array(z.string()).optional(),
   byProvider: z.record(z.string(), ToolPolicyWithProfileSchema).optional(),
   web: ToolsWebSchema,
-  audio: z
-    .object({
-      transcription: ToolsAudioTranscriptionSchema,
-    })
-    .optional(),
+  media: ToolsMediaSchema,
   message: z
     .object({
       allowCrossContextSend: z.boolean().optional(),
@@ -240,10 +240,68 @@ export const ExecutableTokenSchema = z
   .string()
   .refine(isSafeExecutableValue, "expected safe executable name or path");

-export const ToolsAudioTranscriptionSchema = z
-  .object({
-    args: z.array(z.string()).optional(),
-    timeoutSeconds: z.number().int().positive().optional(),
-  })
-  .optional();
+export const MediaUnderstandingScopeSchema = z
+  .object({
+    default: z.union([z.literal("allow"), z.literal("deny")]).optional(),
+    rules: z
+      .array(
+        z.object({
+          action: z.union([z.literal("allow"), z.literal("deny")]),
+          match: z
+            .object({
+              channel: z.string().optional(),
+              chatType: z
+                .union([z.literal("direct"), z.literal("group"), z.literal("room")])
+                .optional(),
+              keyPrefix: z.string().optional(),
+            })
+            .optional(),
+        }),
+      )
+      .optional(),
+  })
+  .optional();
+
+export const MediaUnderstandingCapabilitiesSchema = z
+  .array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
+  .optional();
+
+export const MediaUnderstandingModelSchema = z
+  .object({
+    provider: z.string().optional(),
+    model: z.string().optional(),
+    capabilities: MediaUnderstandingCapabilitiesSchema,
+    type: z.union([z.literal("provider"), z.literal("cli")]).optional(),
+    command: z.string().optional(),
+    args: z.array(z.string()).optional(),
+    prompt: z.string().optional(),
+    maxChars: z.number().int().positive().optional(),
+    maxBytes: z.number().int().positive().optional(),
+    timeoutSeconds: z.number().int().positive().optional(),
+    language: z.string().optional(),
+    profile: z.string().optional(),
+    preferredProfile: z.string().optional(),
+  })
+  .optional();
+
+export const ToolsMediaUnderstandingSchema = z
+  .object({
+    enabled: z.boolean().optional(),
+    scope: MediaUnderstandingScopeSchema,
+    maxBytes: z.number().int().positive().optional(),
+    maxChars: z.number().int().positive().optional(),
+    prompt: z.string().optional(),
+    timeoutSeconds: z.number().int().positive().optional(),
+    language: z.string().optional(),
+    models: z.array(MediaUnderstandingModelSchema).optional(),
+  })
+  .optional();
+
+export const ToolsMediaSchema = z
+  .object({
+    image: ToolsMediaUnderstandingSchema.optional(),
+    audio: ToolsMediaUnderstandingSchema.optional(),
+    video: ToolsMediaUnderstandingSchema.optional(),
+  })
+  .optional();
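Note: a quick sketch of how these schemas validate a tools.media fragment, assuming ToolsMediaSchema is imported from zod-schema.core.js as above. Unknown chatType values or non-positive byte limits are rejected at parse time.

    import { ToolsMediaSchema } from "./zod-schema.core.js";

    const parsed = ToolsMediaSchema.parse({
      image: { enabled: true, maxChars: 300 },
      audio: {
        scope: {
          default: "deny",
          rules: [{ action: "allow", match: { chatType: "direct" } }],
        },
      },
    });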
src/media-understanding/apply.test.ts (new file, 258 lines)
@@ -0,0 +1,258 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";

import { beforeEach, describe, expect, it, vi } from "vitest";

import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import { fetchRemoteMedia } from "../media/fetch.js";

vi.mock("../agents/model-auth.js", () => ({
  resolveApiKeyForProvider: vi.fn(async () => ({
    apiKey: "test-key",
    source: "test",
  })),
}));

vi.mock("../media/fetch.js", () => ({
  fetchRemoteMedia: vi.fn(),
}));

vi.mock("../process/exec.js", () => ({
  runExec: vi.fn(),
}));

async function loadApply() {
  return await import("./apply.js");
}

describe("applyMediaUnderstanding", () => {
  const mockedResolveApiKey = vi.mocked(resolveApiKeyForProvider);
  const mockedFetchRemoteMedia = vi.mocked(fetchRemoteMedia);

  beforeEach(() => {
    mockedResolveApiKey.mockClear();
    mockedFetchRemoteMedia.mockReset();
    mockedFetchRemoteMedia.mockResolvedValue({
      buffer: Buffer.from("audio-bytes"),
      contentType: "audio/ogg",
      fileName: "note.ogg",
    });
  });

  it("sets Transcript and replaces Body when audio transcription succeeds", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
    await fs.writeFile(audioPath, "hello");

    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: audioPath,
      MediaType: "audio/ogg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            models: [{ provider: "groq" }],
          },
        },
      },
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async () => ({ text: "transcribed text" }),
        },
      },
    });

    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("transcribed text");
    expect(ctx.Body).toBe("[Audio]\nTranscript:\ntranscribed text");
    expect(ctx.CommandBody).toBe("transcribed text");
    expect(ctx.RawBody).toBe("transcribed text");
  });

  it("handles URL-only attachments for audio transcription", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaUrl: "https://example.com/note.ogg",
      MediaType: "audio/ogg",
      ChatType: "dm",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            scope: {
              default: "deny",
              rules: [{ action: "allow", match: { chatType: "direct" } }],
            },
            models: [{ provider: "groq" }],
          },
        },
      },
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async () => ({ text: "remote transcript" }),
        },
      },
    });

    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("remote transcript");
    expect(ctx.Body).toBe("[Audio]\nTranscript:\nremote transcript");
  });

  it("skips audio transcription when attachment exceeds maxBytes", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPath = path.join(dir, "large.wav");
    await fs.writeFile(audioPath, "0123456789");

    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: audioPath,
      MediaType: "audio/wav",
    };
    const transcribeAudio = vi.fn(async () => ({ text: "should-not-run" }));
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 4,
            models: [{ provider: "groq" }],
          },
        },
      },
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: { groq: { id: "groq", transcribeAudio } },
    });

    expect(result.appliedAudio).toBe(false);
    expect(transcribeAudio).not.toHaveBeenCalled();
    expect(ctx.Body).toBe("<media:audio>");
  });

  it("falls back to CLI model when provider fails", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const audioPath = path.join(dir, "note.ogg");
    await fs.writeFile(audioPath, "hello");

    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: audioPath,
      MediaType: "audio/ogg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            models: [
              { provider: "groq" },
              {
                type: "cli",
                command: "whisper",
                args: ["{{MediaPath}}"],
              },
            ],
          },
        },
      },
    };

    const execModule = await import("../process/exec.js");
    vi.mocked(execModule.runExec).mockResolvedValue({
      stdout: "cli transcript\n",
      stderr: "",
    });

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
      providers: {
        groq: {
          id: "groq",
          transcribeAudio: async () => {
            throw new Error("boom");
          },
        },
      },
    });

    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("cli transcript");
    expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
  });

  it("uses CLI image understanding and preserves caption for commands", async () => {
    const { applyMediaUnderstanding } = await loadApply();
    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
    const imagePath = path.join(dir, "photo.jpg");
    await fs.writeFile(imagePath, "image-bytes");

    const ctx: MsgContext = {
      Body: "<media:image> show Dom",
      MediaPath: imagePath,
      MediaType: "image/jpeg",
    };
    const cfg: ClawdbotConfig = {
      tools: {
        media: {
          image: {
            enabled: true,
            models: [
              {
                type: "cli",
                command: "gemini",
                args: ["--file", "{{MediaPath}}", "--prompt", "{{Prompt}}"],
              },
            ],
          },
        },
      },
    };

    const execModule = await import("../process/exec.js");
    vi.mocked(execModule.runExec).mockResolvedValue({
      stdout: "image description\n",
      stderr: "",
    });

    const result = await applyMediaUnderstanding({
      ctx,
      cfg,
    });

    expect(result.appliedImage).toBe(true);
    expect(ctx.Body).toBe("[Image]\nUser text:\nshow Dom\nDescription:\nimage description");
    expect(ctx.CommandBody).toBe("show Dom");
    expect(ctx.RawBody).toBe("show Dom");
  });
});
src/media-understanding/apply.ts (new file, 804 lines)
@@ -0,0 +1,804 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath } from "node:url";

import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
import { complete } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";

import type { ClawdbotConfig } from "../config/config.js";
import type { MsgContext } from "../auto-reply/templating.js";
import { applyTemplate } from "../auto-reply/templating.js";
import { getApiKeyForModel, resolveApiKeyForProvider } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { minimaxUnderstandImage } from "../agents/minimax-vlm.js";
import { logVerbose, shouldLogVerbose } from "../globals.js";
import { fetchRemoteMedia } from "../media/fetch.js";
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
import { runExec } from "../process/exec.js";
import type {
  MediaUnderstandingConfig,
  MediaUnderstandingModelConfig,
  MediaUnderstandingScopeConfig,
} from "../config/types.tools.js";
import { extractMediaUserText, formatMediaUnderstandingBody } from "./format.js";
import {
  buildMediaUnderstandingRegistry,
  getMediaUnderstandingProvider,
  normalizeMediaProviderId,
} from "./providers/index.js";
import { fetchWithTimeout } from "./providers/shared.js";
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
import type {
  MediaAttachment,
  MediaUnderstandingOutput,
  MediaUnderstandingProvider,
} from "./types.js";
import { coerceImageAssistantText } from "../agents/tools/image-tool.helpers.js";

const MB = 1024 * 1024;
const DEFAULT_MAX_CHARS = 500;
const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<Capability, number | undefined> = {
  image: DEFAULT_MAX_CHARS,
  audio: undefined,
  video: DEFAULT_MAX_CHARS,
};
const DEFAULT_MAX_BYTES: Record<Capability, number> = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB,
};
const DEFAULT_TIMEOUT_SECONDS: Record<Capability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};
const DEFAULT_PROMPT: Record<Capability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};
const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
};
const CLI_OUTPUT_MAX_BUFFER = 5 * MB;

export type ApplyMediaUnderstandingResult = {
  outputs: MediaUnderstandingOutput[];
  appliedImage: boolean;
  appliedAudio: boolean;
  appliedVideo: boolean;
};

type Capability = "image" | "audio" | "video";

type MediaBufferResult = {
  buffer: Buffer;
  mime?: string;
  fileName: string;
};

type MediaPathResult = {
  path: string;
  cleanup?: () => Promise<void> | void;
};

function normalizeAttachmentPath(raw?: string | null): string | undefined {
  const value = raw?.trim();
  if (!value) return undefined;
  if (value.startsWith("file://")) {
    try {
      return fileURLToPath(value);
    } catch {
      return undefined;
    }
  }
  return value;
}

function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
  const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
  const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
  const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
  const resolveMime = (count: number, index: number) => {
    const typeHint = typesFromArray?.[index];
    const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
    if (trimmed) return trimmed;
    return count === 1 ? ctx.MediaType : undefined;
  };

  if (pathsFromArray && pathsFromArray.length > 0) {
    const count = pathsFromArray.length;
    const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
    return pathsFromArray
      .map((value, index) => ({
        path: value?.trim() || undefined,
        url: urls?.[index] ?? ctx.MediaUrl,
        mime: resolveMime(count, index),
        index,
      }))
      .filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
  }

  if (urlsFromArray && urlsFromArray.length > 0) {
    const count = urlsFromArray.length;
    return urlsFromArray
      .map((value, index) => ({
        path: undefined,
        url: value?.trim() || undefined,
        mime: resolveMime(count, index),
        index,
      }))
      .filter((entry) => Boolean(entry.url?.trim()));
  }

  const pathValue = ctx.MediaPath?.trim();
  const url = ctx.MediaUrl?.trim();
  if (!pathValue && !url) return [];
  return [
    {
      path: pathValue || undefined,
      url: url || undefined,
      mime: ctx.MediaType,
      index: 0,
    },
  ];
}

function isVideoAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("video/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext);
}

function isAudioAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("audio/")) return true;
  return isAudioFileName(attachment.path ?? attachment.url);
}

function isImageAttachment(attachment: MediaAttachment): boolean {
  if (attachment.mime?.startsWith("image/")) return true;
  const ext = getFileExtension(attachment.path ?? attachment.url);
  if (!ext) return false;
  return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext);
}

function estimateBase64Size(bytes: number): number {
  return Math.ceil(bytes / 3) * 4;
}

function resolveVideoMaxBase64Bytes(maxBytes: number): number {
  const expanded = Math.floor(maxBytes * (4 / 3));
  return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
}

function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
  const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
  return Math.max(1000, Math.floor(value * 1000));
}

function resolvePrompt(capability: Capability, prompt?: string, maxChars?: number): string {
  const base = prompt?.trim() || DEFAULT_PROMPT[capability];
  if (!maxChars || capability === "audio") return base;
  return `${base} Respond in at most ${maxChars} characters.`;
}

function resolveRequestUrl(input: RequestInfo | URL): string {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
}

function normalizeErrorMessage(err: unknown): string {
  if (!err) return "";
  if (typeof err === "string") return err;
  if (err instanceof Error) return err.message;
  try {
    return JSON.stringify(err);
  } catch {
    return "";
  }
}

function resolveMaxChars(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
}): number | undefined {
  const { capability, entry, cfg } = params;
  const configured = entry.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
  if (typeof configured === "number") return configured;
  return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
}

function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  if (!maxChars || trimmed.length <= maxChars) return trimmed;
  return trimmed.slice(0, maxChars).trim();
}

function resolveConfigValue<T>(primary: T | undefined, fallback: T): T {
  return primary === undefined ? fallback : primary;
}

function resolveCapabilityConfig(
  cfg: ClawdbotConfig,
  capability: Capability,
): MediaUnderstandingConfig | undefined {
  return cfg.tools?.media?.[capability];
}

function resolveScopeDecision(params: {
  scope?: MediaUnderstandingScopeConfig;
  ctx: MsgContext;
}): "allow" | "deny" {
  return resolveMediaUnderstandingScope({
    scope: params.scope,
    sessionKey: params.ctx.SessionKey,
    channel: params.ctx.Surface ?? params.ctx.Provider,
    chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
  });
}

function resolveModelEntries(
  cfg: MediaUnderstandingConfig | undefined,
  capability: Capability,
): MediaUnderstandingModelConfig[] {
  const models = cfg?.models ?? [];
  if (models.length === 0) return [];
  return models.filter((entry) => {
    const caps = entry.capabilities;
    if (!caps || caps.length === 0) return true;
    return caps.includes(capability);
  });
}

function isMaxBytesError(err: unknown): boolean {
  const message = normalizeErrorMessage(err);
  if (!message) return false;
  return message.includes("exceeds maxBytes") || message.includes("payload exceeds maxBytes");
}

async function loadAttachmentBuffer(params: {
  attachment: MediaAttachment;
  maxBytes: number;
  timeoutMs: number;
}): Promise<MediaBufferResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      const buffer = await fs.readFile(resolved);
      const mime =
        attachment.mime ??
        (await detectMime({
          buffer,
          filePath: resolved,
        }));
      const fileName = path.basename(resolved) || `media-${attachment.index + 1}`;
      return { buffer, mime, fileName };
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }

  const url = attachment.url?.trim();
  if (!url) return undefined;

  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    if (fetched.buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${fetched.buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    const mime =
      attachment.mime ??
      fetched.contentType ??
      (await detectMime({
        buffer: fetched.buffer,
        filePath: fetched.fileName ?? url,
      }));
    const fileName = fetched.fileName ?? `media-${attachment.index + 1}`;
    return { buffer: fetched.buffer, mime, fileName };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }

  return undefined;
}

async function resolveAttachmentPath(params: {
  attachment: MediaAttachment;
  maxBytes?: number;
  timeoutMs: number;
}): Promise<MediaPathResult | undefined> {
  const { attachment, maxBytes, timeoutMs } = params;
  const rawPath = normalizeAttachmentPath(attachment.path);
  if (rawPath) {
    const resolved = path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
    try {
      const stat = await fs.stat(resolved);
      if (!stat.isFile()) return undefined;
      if (maxBytes && stat.size > maxBytes) {
        if (shouldLogVerbose()) {
          logVerbose(
            `Skipping media attachment ${attachment.index + 1}: ${stat.size} bytes exceeds ${maxBytes}`,
          );
        }
        return undefined;
      }
      return { path: resolved };
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${attachment.index + 1}: ${String(err)}`);
      }
    }
  }

  const url = attachment.url?.trim();
  if (!url) return undefined;

  try {
    const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
      fetchWithTimeout(resolveRequestUrl(input), init ?? {}, timeoutMs, fetch);
    const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes });
    const buffer = fetched.buffer;
    if (maxBytes && buffer.length > maxBytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping media attachment ${attachment.index + 1}: ${buffer.length} bytes exceeds ${maxBytes}`,
        );
      }
      return undefined;
    }
    const extension = fetched.fileName ? path.extname(fetched.fileName) : "";
    const tmpPath = path.join(
      os.tmpdir(),
      `clawdbot-media-${crypto.randomUUID()}${extension || ""}`,
    );
    await fs.writeFile(tmpPath, buffer);
    return {
      path: tmpPath,
      cleanup: async () => {
        await fs.unlink(tmpPath).catch(() => {});
      },
    };
  } catch (err) {
    if (shouldLogVerbose()) {
      logVerbose(`Failed to fetch attachment ${attachment.index + 1}: ${String(err)}`);
    }
  }

  return undefined;
}

async function describeImageWithModel(params: {
  cfg: ClawdbotConfig;
  agentDir: string;
  provider: string;
  model: string;
  prompt: string;
  maxChars?: number;
  buffer: Buffer;
  mimeType: string;
  profile?: string;
  preferredProfile?: string;
}): Promise<{ text: string; model: string }> {
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
  if (!model) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  if (!model.input?.includes("image")) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }
  const apiKeyInfo = await getApiKeyForModel({
    model,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);

  const base64 = params.buffer.toString("base64");
  if (model.provider === "minimax") {
    const text = await minimaxUnderstandImage({
      apiKey: apiKeyInfo.apiKey,
      prompt: params.prompt,
      imageDataUrl: `data:${params.mimeType};base64,${base64}`,
      modelBaseUrl: model.baseUrl,
    });
    return { text, model: model.id };
  }

  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: params.prompt },
          { type: "image", data: base64, mimeType: params.mimeType },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const message = (await complete(model, context, {
    apiKey: apiKeyInfo.apiKey,
    maxTokens: 512,
  })) as AssistantMessage;
  const text = coerceImageAssistantText({
    message,
    provider: model.provider,
    model: model.id,
  });
  return { text, model: model.id };
}

async function runProviderEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachment } = params;
  const providerIdRaw = entry.provider?.trim();
  if (!providerIdRaw) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(providerIdRaw);
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );

  if (capability === "image") {
    if (!params.agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const mimeType = media.mime ?? "image/jpeg";
    const result = await describeImageWithModel({
      cfg,
      agentDir: params.agentDir,
      provider: providerId,
      model: modelId,
      prompt,
      maxChars,
      buffer: media.buffer,
      mimeType,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
    });
    return {
      kind: "image.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model,
    };
  }

  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }

  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const result = await provider.transcribeAudio({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model,
      language: entry.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? model,
    };
  }

  if (capability === "video") {
    if (!provider.describeVideo) {
      throw new Error(`Video understanding provider "${providerId}" not available.`);
    }
    const media = await loadAttachmentBuffer({ attachment, maxBytes, timeoutMs });
    if (!media) return null;
    const estimatedBase64Bytes = estimateBase64Size(media.buffer.length);
    const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
    if (estimatedBase64Bytes > maxBase64Bytes) {
      if (shouldLogVerbose()) {
        logVerbose(
          `Skipping video attachment ${attachment.index + 1}: base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
        );
      }
      return null;
    }
    const key = await resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir: params.agentDir,
    });
    const providerConfig = cfg.models?.providers?.[providerId];
    const result = await provider.describeVideo({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey: key.apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model: entry.model,
      prompt,
      timeoutMs,
    });
    return {
      kind: "video.description",
      attachmentIndex: attachment.index,
      text: trimOutput(result.text, maxChars),
      provider: providerId,
      model: result.model ?? entry.model,
    };
  }

  return null;
}

async function runCliEntry(params: {
  capability: Capability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachment: MediaAttachment;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachment } = params;
  const command = entry.command?.trim();
  const args = entry.args ?? [];
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }
  const maxBytes =
    entry.maxBytes ??
    resolveConfigValue(cfg.tools?.media?.[capability]?.maxBytes, DEFAULT_MAX_BYTES[capability]);
  const maxChars = resolveMaxChars({ capability, entry, cfg });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? cfg.tools?.media?.[capability]?.prompt,
    maxChars,
  );
  const pathResult = await resolveAttachmentPath({
    attachment,
    maxBytes,
    timeoutMs,
  });
  if (!pathResult) return null;

  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: pathResult.path,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  const argv = [command, ...args].map((part, index) =>
    index === 0 ? part : applyTemplate(part, templCtx),
  );
  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${argv.join(" ")}`);
  }
  try {
    const { stdout } = await runExec(argv[0], argv.slice(1), {
      timeoutMs,
      maxBuffer: CLI_OUTPUT_MAX_BUFFER,
    });
    const text = trimOutput(stdout, maxChars);
    if (!text) return null;
    return {
      kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
      attachmentIndex: attachment.index,
      text,
      provider: "cli",
      model: command,
    };
  } finally {
    if (pathResult.cleanup) {
      await pathResult.cleanup();
    }
  }
}

async function runCapability(params: {
  capability: Capability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachment[];
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
}): Promise<MediaUnderstandingOutput | null> {
  const { capability, cfg, ctx, attachments } = params;
  const config = resolveCapabilityConfig(cfg, capability);
  if (!config || config.enabled === false) return null;
  const entries = resolveModelEntries(config, capability);
  if (entries.length === 0) return null;

  const scopeDecision = resolveScopeDecision({ scope: config.scope, ctx });
  if (scopeDecision === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return null;
  }

  const attachment = attachments.find((item) => {
    if (capability === "image") return isImageAttachment(item);
    if (capability === "audio") return isAudioAttachment(item);
    return isVideoAttachment(item);
  });
  if (!attachment) return null;

  for (const entry of entries) {
    try {
      const entryType = entry.type ?? (entry.command ? "cli" : "provider");
      const result =
        entryType === "cli"
          ? await runCliEntry({ capability, entry, cfg, ctx, attachment })
          : await runProviderEntry({
              capability,
              entry,
              cfg,
              ctx,
              attachment,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });
      if (result) return result;
    } catch (err) {
      if (isMaxBytesError(err)) {
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to size: ${String(err)}`);
        }
      } else if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${String(err)}`);
      }
    }
  }

  return null;
}

export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: ClawdbotConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg } = params;
  const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
  const originalUserText =
    commandCandidates
      .map((value) => extractMediaUserText(value))
      .find((value) => value && value.trim()) ?? undefined;

  const attachments = normalizeAttachments(ctx);
  const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
  const outputs: MediaUnderstandingOutput[] = [];

  const imageOutput = await runCapability({
    capability: "image",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (imageOutput) outputs.push(imageOutput);

  const audioOutput = await runCapability({
    capability: "audio",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (audioOutput) outputs.push(audioOutput);

  const videoOutput = await runCapability({
    capability: "video",
    cfg,
    ctx,
    attachments,
    agentDir: params.agentDir,
    providerRegistry,
  });
  if (videoOutput) outputs.push(videoOutput);

  if (outputs.length > 0) {
    ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
    const audioResult = outputs.find((output) => output.kind === "audio.transcription");
    if (audioResult) {
      ctx.Transcript = audioResult.text;
      ctx.CommandBody = audioResult.text;
      ctx.RawBody = audioResult.text;
    } else if (originalUserText) {
      ctx.CommandBody = originalUserText;
      ctx.RawBody = originalUserText;
    }
    ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
  }

  return {
    outputs,
    appliedImage: outputs.some((output) => output.kind === "image.description"),
    appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
    appliedVideo: outputs.some((output) => output.kind === "video.description"),
  };
}
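Note: a minimal sketch of calling the entry point directly with a CLI-only image model. The describe-image command and the /tmp path are placeholders; on this path no provider API key or agentDir is needed, and the CLI's stdout becomes the description.

    import { applyMediaUnderstanding } from "./media-understanding/index.js";
    import type { MsgContext } from "./auto-reply/templating.js";

    const ctx: MsgContext = {
      Body: "<media:image> what is this?",
      MediaPath: "/tmp/photo.jpg", // placeholder
      MediaType: "image/jpeg",
    };

    const result = await applyMediaUnderstanding({
      ctx,
      cfg: {
        tools: {
          media: {
            image: {
              models: [{ type: "cli", command: "describe-image", args: ["{{MediaPath}}"] }],
            },
          },
        },
      },
    });
    // On success ctx.Body becomes "[Image] ... Description: ...", while
    // ctx.CommandBody/RawBody keep the original caption for command parsing.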
src/media-understanding/format.test.ts (new file, 91 lines)
@@ -0,0 +1,91 @@
import { describe, expect, it } from "vitest";
import { formatMediaUnderstandingBody } from "./format.js";

describe("formatMediaUnderstandingBody", () => {
  it("replaces placeholder body with transcript", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio>",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "hello world",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nTranscript:\nhello world");
  });

  it("includes user text when body is meaningful", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("strips leading media placeholders from user text", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:audio> caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "transcribed",
          provider: "groq",
        },
      ],
    });
    expect(body).toBe("[Audio]\nUser text:\ncaption here\nTranscript:\ntranscribed");
  });

  it("keeps user text once when multiple outputs exist", () => {
    const body = formatMediaUnderstandingBody({
      body: "caption here",
      outputs: [
        {
          kind: "audio.transcription",
          attachmentIndex: 0,
          text: "audio text",
          provider: "groq",
        },
        {
          kind: "video.description",
          attachmentIndex: 1,
          text: "video text",
          provider: "google",
        },
      ],
    });
    expect(body).toBe(
      [
        "User text:\ncaption here",
        "[Audio]\nTranscript:\naudio text",
        "[Video]\nDescription:\nvideo text",
      ].join("\n\n"),
    );
  });

  it("formats image outputs", () => {
    const body = formatMediaUnderstandingBody({
      body: "<media:image>",
      outputs: [
        {
          kind: "image.description",
          attachmentIndex: 0,
          text: "a cat",
          provider: "openai",
        },
      ],
    });
    expect(body).toBe("[Image]\nDescription:\na cat");
  });
});
src/media-understanding/format.ts (new file, 77 lines)
@@ -0,0 +1,77 @@
import type { MediaUnderstandingOutput } from "./types.js";

const MEDIA_PLACEHOLDER_RE = /^<media:[^>]+>(\s*\([^)]*\))?$/i;
const MEDIA_PLACEHOLDER_TOKEN_RE = /^<media:[^>]+>(\s*\([^)]*\))?\s*/i;

export function extractMediaUserText(body?: string): string | undefined {
  const trimmed = body?.trim() ?? "";
  if (!trimmed) return undefined;
  if (MEDIA_PLACEHOLDER_RE.test(trimmed)) return undefined;
  const cleaned = trimmed.replace(MEDIA_PLACEHOLDER_TOKEN_RE, "").trim();
  return cleaned || undefined;
}

function formatSection(
  title: "Audio" | "Video" | "Image",
  kind: "Transcript" | "Description",
  text: string,
  userText?: string,
): string {
  const lines = [`[${title}]`];
  if (userText) {
    lines.push(`User text:\n${userText}`);
  }
  lines.push(`${kind}:\n${text}`);
  return lines.join("\n");
}

export function formatMediaUnderstandingBody(params: {
  body?: string;
  outputs: MediaUnderstandingOutput[];
}): string {
  const outputs = params.outputs.filter((output) => output.text.trim());
  if (outputs.length === 0) {
    return params.body ?? "";
  }

  const userText = extractMediaUserText(params.body);
  const sections: string[] = [];
  if (userText && outputs.length > 1) {
    sections.push(`User text:\n${userText}`);
  }

  for (const output of outputs) {
    if (output.kind === "audio.transcription") {
      sections.push(
        formatSection(
          "Audio",
          "Transcript",
          output.text,
          outputs.length === 1 ? userText : undefined,
        ),
      );
      continue;
    }
    if (output.kind === "image.description") {
      sections.push(
        formatSection(
          "Image",
          "Description",
          output.text,
          outputs.length === 1 ? userText : undefined,
        ),
      );
      continue;
    }
    sections.push(
      formatSection(
        "Video",
        "Description",
        output.text,
        outputs.length === 1 ? userText : undefined,
      ),
    );
  }

  return sections.join("\n\n").trim();
}
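Note: the two regexes above give extractMediaUserText its edge cases; a small sketch of the observable behavior, derived directly from the regex definitions:

    import { extractMediaUserText } from "./format.js";

    extractMediaUserText("<media:audio>");          // undefined (placeholder only)
    extractMediaUserText("<media:image> show Dom"); // "show Dom" (leading token stripped)
    extractMediaUserText("  plain caption  ");      // "plain caption"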
src/media-understanding/index.ts (new file, 9 lines)
@@ -0,0 +1,9 @@
export { applyMediaUnderstanding } from "./apply.js";
export { formatMediaUnderstandingBody } from "./format.js";
export { resolveMediaUnderstandingScope } from "./scope.js";
export type {
  MediaAttachment,
  MediaUnderstandingOutput,
  MediaUnderstandingProvider,
  MediaUnderstandingKind,
} from "./types.js";
src/media-understanding/providers/google/index.ts (new file, 7 lines)
@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeGeminiVideo } from "./video.js";

export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  describeVideo: describeGeminiVideo,
};
src/media-understanding/providers/google/video.test.ts (new file, 93 lines)
@@ -0,0 +1,93 @@
import { describe, expect, it } from "vitest";

import { describeGeminiVideo } from "./video.js";

const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};

describe("describeGeminiVideo", () => {
  it("respects case-insensitive x-goog-api-key overrides", async () => {
    let seenKey: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenKey = headers.get("x-goog-api-key");
      return new Response(
        JSON.stringify({
          candidates: [{ content: { parts: [{ text: "video ok" }] } }],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };

    const result = await describeGeminiVideo({
      buffer: Buffer.from("video"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { "X-Goog-Api-Key": "override" },
      fetchFn,
    });

    expect(seenKey).toBe("override");
    expect(result.text).toBe("video ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(
        JSON.stringify({
          candidates: [
            {
              content: {
                parts: [{ text: "first" }, { text: " second " }, { text: "" }],
              },
            },
          ],
        }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    };

    const result = await describeGeminiVideo({
      buffer: Buffer.from("video-bytes"),
      fileName: "clip.mp4",
      apiKey: "test-key",
      timeoutMs: 1500,
      baseUrl: "https://example.com/v1beta/",
      model: "gemini-3-pro",
      headers: { "X-Other": "1" },
      fetchFn,
    });

    expect(result.model).toBe("gemini-3-pro-preview");
    expect(result.text).toBe("first\nsecond");
    expect(seenUrl).toBe("https://example.com/v1beta/models/gemini-3-pro-preview:generateContent");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("x-goog-api-key")).toBe("test-key");
    expect(headers.get("content-type")).toBe("application/json");
    expect(headers.get("x-other")).toBe("1");

    const bodyText =
      typeof seenInit?.body === "string"
        ? seenInit.body
        : Buffer.isBuffer(seenInit?.body)
          ? seenInit.body.toString("utf8")
          : "";
    const body = JSON.parse(bodyText);
    expect(body.contents?.[0]?.parts?.[0]?.text).toBe("Describe the video.");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.mime_type).toBe("video/mp4");
    expect(body.contents?.[0]?.parts?.[1]?.inline_data?.data).toBe(
      Buffer.from("video-bytes").toString("base64"),
    );
  });
});
src/media-understanding/providers/google/video.ts (new file, 84 lines)
@@ -0,0 +1,84 @@
import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";

export const DEFAULT_GOOGLE_VIDEO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GOOGLE_VIDEO_MODEL = "gemini-3-flash-preview";
const DEFAULT_GOOGLE_VIDEO_PROMPT = "Describe the video.";

function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  if (!trimmed) return DEFAULT_GOOGLE_VIDEO_MODEL;
  return normalizeGoogleModelId(trimmed);
}

function resolvePrompt(prompt?: string): string {
  const trimmed = prompt?.trim();
  return trimmed || DEFAULT_GOOGLE_VIDEO_PROMPT;
}

export async function describeGeminiVideo(
  params: VideoDescriptionRequest,
): Promise<VideoDescriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_VIDEO_BASE_URL);
  const model = resolveModel(params.model);
  const url = `${baseUrl}/models/${model}:generateContent`;

  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) {
    headers.set("content-type", "application/json");
  }
  if (!headers.has("x-goog-api-key")) {
    headers.set("x-goog-api-key", params.apiKey);
  }

  const body = {
    contents: [
      {
        role: "user",
        parts: [
          { text: resolvePrompt(params.prompt) },
          {
            inline_data: {
              mime_type: params.mime ?? "video/mp4",
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Video description failed (HTTP ${res.status})${suffix}`);
  }

  const payload = (await res.json()) as {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  };
  const parts = payload.candidates?.[0]?.content?.parts ?? [];
  const text = parts
    .map((part) => part?.text?.trim())
    .filter(Boolean)
    .join("\n");
  if (!text) {
    throw new Error("Video description response missing text");
  }
  return { text, model };
}
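Note: the clip is inlined as base64 in the JSON body, which inflates the payload by a factor of roughly 4/3; this is presumably why apply.ts caps the estimated base64 size at 70 MB. A sketch of the arithmetic, mirroring estimateBase64Size in apply.ts:

    // Base64 emits 4 output chars for every 3 input bytes.
    const estimateBase64Size = (bytes: number): number => Math.ceil(bytes / 3) * 4;
    estimateBase64Size(50 * 1024 * 1024); // 69,905,068 bytes (~66.7 MB), under the 70 MB cap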
src/media-understanding/providers/groq/index.ts (new file, 13 lines)
@@ -0,0 +1,13 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "../openai/audio.js";

const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";

export const groqProvider: MediaUnderstandingProvider = {
  id: "groq",
  transcribeAudio: (req) =>
    transcribeOpenAiCompatibleAudio({
      ...req,
      baseUrl: req.baseUrl ?? DEFAULT_GROQ_AUDIO_BASE_URL,
    }),
};
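A sketch of what the delegation above amounts to; the input file is illustrative and the model id is an assumption (check Groq's current Whisper model list):

import { readFile } from "node:fs/promises";
import { groqProvider } from "./src/media-understanding/providers/groq/index.js";

const audioBuffer = await readFile("note.ogg"); // hypothetical input
// POSTs to https://api.groq.com/openai/v1/audio/transcriptions
// unless the caller supplies its own baseUrl.
const result = await groqProvider.transcribeAudio?.({
  buffer: audioBuffer,
  fileName: "note.ogg",
  apiKey: process.env.GROQ_API_KEY ?? "",
  model: "whisper-large-v3", // assumed model id
  timeoutMs: 120_000,
});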
src/media-understanding/providers/index.ts (new file, 35 lines)
@@ -0,0 +1,35 @@
import { normalizeProviderId } from "../../agents/model-selection.js";
import type { MediaUnderstandingProvider } from "../types.js";
import { googleProvider } from "./google/index.js";
import { groqProvider } from "./groq/index.js";
import { openaiProvider } from "./openai/index.js";

const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];

export function normalizeMediaProviderId(id: string): string {
  const normalized = normalizeProviderId(id);
  if (normalized === "gemini") return "google";
  return normalized;
}

export function buildMediaUnderstandingRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
): Map<string, MediaUnderstandingProvider> {
  const registry = new Map<string, MediaUnderstandingProvider>();
  for (const provider of PROVIDERS) {
    registry.set(normalizeMediaProviderId(provider.id), provider);
  }
  if (overrides) {
    for (const [key, provider] of Object.entries(overrides)) {
      registry.set(normalizeMediaProviderId(key), provider);
    }
  }
  return registry;
}

export function getMediaUnderstandingProvider(
  id: string,
  registry: Map<string, MediaUnderstandingProvider>,
): MediaUnderstandingProvider | undefined {
  return registry.get(normalizeMediaProviderId(id));
}
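A sketch of typical registry use, assuming only the exports above (the import path is illustrative):

import {
  buildMediaUnderstandingRegistry,
  getMediaUnderstandingProvider,
} from "./src/media-understanding/providers/index.js";

const registry = buildMediaUnderstandingRegistry();
// "gemini" is normalized to "google", so both ids resolve to the same provider.
const viaAlias = getMediaUnderstandingProvider("gemini", registry);
const direct = getMediaUnderstandingProvider("google", registry);
console.log(viaAlias === direct); // true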
src/media-understanding/providers/openai/audio.test.ts (new file, 86 lines)
@@ -0,0 +1,86 @@
import { describe, expect, it } from "vitest";

import { transcribeOpenAiCompatibleAudio } from "./audio.js";

const resolveRequestUrl = (input: RequestInfo | URL) => {
  if (typeof input === "string") return input;
  if (input instanceof URL) return input.toString();
  return input.url;
};

describe("transcribeOpenAiCompatibleAudio", () => {
  it("respects lowercase authorization header overrides", async () => {
    let seenAuth: string | null = null;
    const fetchFn = async (_input: RequestInfo | URL, init?: RequestInit) => {
      const headers = new Headers(init?.headers);
      seenAuth = headers.get("authorization");
      return new Response(JSON.stringify({ text: "ok" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };

    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio"),
      fileName: "note.mp3",
      apiKey: "test-key",
      timeoutMs: 1000,
      headers: { authorization: "Bearer override" },
      fetchFn,
    });

    expect(seenAuth).toBe("Bearer override");
    expect(result.text).toBe("ok");
  });

  it("builds the expected request payload", async () => {
    let seenUrl: string | null = null;
    let seenInit: RequestInit | undefined;
    const fetchFn = async (input: RequestInfo | URL, init?: RequestInit) => {
      seenUrl = resolveRequestUrl(input);
      seenInit = init;
      return new Response(JSON.stringify({ text: "hello" }), {
        status: 200,
        headers: { "content-type": "application/json" },
      });
    };

    const result = await transcribeOpenAiCompatibleAudio({
      buffer: Buffer.from("audio-bytes"),
      fileName: "voice.wav",
      apiKey: "test-key",
      timeoutMs: 1234,
      baseUrl: "https://api.example.com/v1/",
      model: " ",
      language: " en ",
      prompt: " hello ",
      mime: "audio/wav",
      headers: { "X-Custom": "1" },
      fetchFn,
    });

    expect(result.model).toBe("whisper-1");
    expect(result.text).toBe("hello");
    expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
    expect(seenInit?.method).toBe("POST");
    expect(seenInit?.signal).toBeInstanceOf(AbortSignal);

    const headers = new Headers(seenInit?.headers);
    expect(headers.get("authorization")).toBe("Bearer test-key");
    expect(headers.get("x-custom")).toBe("1");

    const form = seenInit?.body as FormData;
    expect(form).toBeInstanceOf(FormData);
    expect(form.get("model")).toBe("whisper-1");
    expect(form.get("language")).toBe("en");
    expect(form.get("prompt")).toBe("hello");
    const file = form.get("file") as Blob | { type?: string; name?: string } | null;
    expect(file).not.toBeNull();
    if (file) {
      expect(file.type).toBe("audio/wav");
      if ("name" in file && typeof file.name === "string") {
        expect(file.name).toBe("voice.wav");
      }
    }
  });
});
src/media-understanding/providers/openai/audio.ts (new file, 61 lines)
@@ -0,0 +1,61 @@
import path from "node:path";

import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";

export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";

function resolveModel(model?: string): string {
  const trimmed = model?.trim();
  return trimmed || DEFAULT_OPENAI_AUDIO_MODEL;
}

export async function transcribeOpenAiCompatibleAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const fetchFn = params.fetchFn ?? fetch;
  const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_OPENAI_AUDIO_BASE_URL);
  const url = `${baseUrl}/audio/transcriptions`;

  const model = resolveModel(params.model);
  const form = new FormData();
  const fileName = params.fileName?.trim() || path.basename(params.fileName) || "audio";
  const bytes = new Uint8Array(params.buffer);
  const blob = new Blob([bytes], {
    type: params.mime ?? "application/octet-stream",
  });
  form.append("file", blob, fileName);
  form.append("model", model);
  if (params.language?.trim()) form.append("language", params.language.trim());
  if (params.prompt?.trim()) form.append("prompt", params.prompt.trim());

  const headers = new Headers(params.headers);
  if (!headers.has("authorization")) {
    headers.set("authorization", `Bearer ${params.apiKey}`);
  }

  const res = await fetchWithTimeout(
    url,
    {
      method: "POST",
      headers,
      body: form,
    },
    params.timeoutMs,
    fetchFn,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    const suffix = detail ? `: ${detail}` : "";
    throw new Error(`Audio transcription failed (HTTP ${res.status})${suffix}`);
  }

  const payload = (await res.json()) as { text?: string };
  const text = payload.text?.trim();
  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}
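For reference, a minimal direct call to the OpenAI-compatible transcription helper. The input file, import path, and key sourcing are illustrative:

import { readFile } from "node:fs/promises";
import { transcribeOpenAiCompatibleAudio } from "./src/media-understanding/providers/openai/audio.js";

const buffer = await readFile("voice-note.ogg"); // hypothetical input
const { text, model } = await transcribeOpenAiCompatibleAudio({
  buffer,
  fileName: "voice-note.ogg",
  mime: "audio/ogg",
  apiKey: process.env.OPENAI_API_KEY ?? "",
  timeoutMs: 120_000,
});
// model falls back to "whisper-1" when no model is configured.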
src/media-understanding/providers/openai/index.ts (new file, 7 lines)
@@ -0,0 +1,7 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { transcribeOpenAiCompatibleAudio } from "./audio.js";

export const openaiProvider: MediaUnderstandingProvider = {
  id: "openai",
  transcribeAudio: transcribeOpenAiCompatibleAudio,
};
src/media-understanding/providers/shared.ts (new file, 33 lines)
@@ -0,0 +1,33 @@
const MAX_ERROR_CHARS = 300;

export function normalizeBaseUrl(baseUrl: string | undefined, fallback: string): string {
  const raw = baseUrl?.trim() || fallback;
  return raw.replace(/\/+$/, "");
}

export async function fetchWithTimeout(
  url: string,
  init: RequestInit,
  timeoutMs: number,
  fetchFn: typeof fetch,
): Promise<Response> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), Math.max(1, timeoutMs));
  try {
    return await fetchFn(url, { ...init, signal: controller.signal });
  } finally {
    clearTimeout(timer);
  }
}

export async function readErrorResponse(res: Response): Promise<string | undefined> {
  try {
    const text = await res.text();
    const collapsed = text.replace(/\s+/g, " ").trim();
    if (!collapsed) return undefined;
    if (collapsed.length <= MAX_ERROR_CHARS) return collapsed;
    return `${collapsed.slice(0, MAX_ERROR_CHARS)}…`;
  } catch {
    return undefined;
  }
}
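A quick sketch of the timeout behavior; the URL is a placeholder:

import { fetchWithTimeout } from "./src/media-understanding/providers/shared.js";

// The AbortController fires after 5s, so the await below rejects
// with an AbortError instead of hanging on a slow endpoint.
const res = await fetchWithTimeout(
  "https://example.com/health",
  { method: "GET" },
  5_000,
  fetch,
);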
src/media-understanding/scope.test.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
import { describe, expect, it } from "vitest";
import { resolveMediaUnderstandingScope } from "./scope.js";

describe("resolveMediaUnderstandingScope", () => {
  it("defaults to allow when scope is undefined", () => {
    expect(resolveMediaUnderstandingScope({})).toBe("allow");
  });

  it("uses first matching rule", () => {
    const decision = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [
          { action: "allow", match: { channel: "whatsapp" } },
          { action: "deny", match: { channel: "whatsapp", chatType: "direct" } },
        ],
      },
      channel: "whatsapp",
      chatType: "direct",
      sessionKey: "whatsapp:direct:123",
    });
    expect(decision).toBe("allow");
  });

  it("matches keyPrefix when provided", () => {
    const decision = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "agent:main:whatsapp:group:123",
    });
    expect(decision).toBe("allow");
  });

  it("matches keyPrefix case-insensitively", () => {
    const decision = resolveMediaUnderstandingScope({
      scope: {
        default: "deny",
        rules: [{ action: "allow", match: { keyPrefix: "agent:main:" } }],
      },
      sessionKey: "AGENT:MAIN:WHATSAPP:GROUP:123",
    });
    expect(decision).toBe("allow");
  });
});
src/media-understanding/scope.ts (new file, 54 lines)
@@ -0,0 +1,54 @@
import type { MediaUnderstandingScopeConfig } from "../config/types.tools.js";

export type MediaUnderstandingScopeDecision = "allow" | "deny";

function normalizeDecision(value?: string | null): MediaUnderstandingScopeDecision | undefined {
  const normalized = value?.trim().toLowerCase();
  if (normalized === "allow") return "allow";
  if (normalized === "deny") return "deny";
  return undefined;
}

function normalizeMatch(value?: string | null): string | undefined {
  const normalized = value?.trim().toLowerCase();
  return normalized || undefined;
}

export function normalizeMediaUnderstandingChatType(raw?: string | null): string | undefined {
  const value = raw?.trim().toLowerCase();
  if (!value) return undefined;
  if (value === "dm" || value === "direct_message" || value === "private") return "direct";
  if (value === "groups") return "group";
  if (value === "channel") return "room";
  return value;
}

export function resolveMediaUnderstandingScope(params: {
  scope?: MediaUnderstandingScopeConfig;
  sessionKey?: string;
  channel?: string;
  chatType?: string;
}): MediaUnderstandingScopeDecision {
  const scope = params.scope;
  if (!scope) return "allow";

  const channel = normalizeMatch(params.channel);
  const chatType = normalizeMatch(params.chatType);
  const sessionKey = normalizeMatch(params.sessionKey) ?? "";

  for (const rule of scope.rules ?? []) {
    if (!rule) continue;
    const action = normalizeDecision(rule.action) ?? "allow";
    const match = rule.match ?? {};
    const matchChannel = normalizeMatch(match.channel);
    const matchChatType = normalizeMatch(match.chatType);
    const matchPrefix = normalizeMatch(match.keyPrefix);

    if (matchChannel && matchChannel !== channel) continue;
    if (matchChatType && matchChatType !== chatType) continue;
    if (matchPrefix && !sessionKey.startsWith(matchPrefix)) continue;
    return action;
  }

  return normalizeDecision(scope.default) ?? "allow";
}
src/media-understanding/types.ts (new file, 62 lines)
@@ -0,0 +1,62 @@
export type MediaUnderstandingKind =
  | "audio.transcription"
  | "video.description"
  | "image.description";

export type MediaAttachment = {
  path?: string;
  url?: string;
  mime?: string;
  index: number;
};

export type MediaUnderstandingOutput = {
  kind: MediaUnderstandingKind;
  attachmentIndex: number;
  text: string;
  provider: string;
  model?: string;
};

export type AudioTranscriptionRequest = {
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  language?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

export type AudioTranscriptionResult = {
  text: string;
  model?: string;
};

export type VideoDescriptionRequest = {
  buffer: Buffer;
  fileName: string;
  mime?: string;
  apiKey: string;
  baseUrl?: string;
  headers?: Record<string, string>;
  model?: string;
  prompt?: string;
  timeoutMs: number;
  fetchFn?: typeof fetch;
};

export type VideoDescriptionResult = {
  text: string;
  model?: string;
};

export type MediaUnderstandingProvider = {
  id: string;
  transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
  describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
};
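A sketch of a custom provider satisfying this interface; "echo" is a hypothetical id, useful mainly as a registry override in tests:

import type { MediaUnderstandingProvider } from "./src/media-understanding/types.js";

const echoProvider: MediaUnderstandingProvider = {
  id: "echo",
  transcribeAudio: async (req) => ({
    text: `received ${req.buffer.length} bytes (${req.mime ?? "unknown mime"})`,
    model: "echo-1", // hypothetical model label
  }),
};

// buildMediaUnderstandingRegistry({ echo: echoProvider }) would register it
// alongside the built-in groq/openai/google providers.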
src/media/fetch.test.ts (new file, 47 lines)
@@ -0,0 +1,47 @@
import { describe, expect, it } from "vitest";

import { fetchRemoteMedia } from "./fetch.js";

function makeStream(chunks: Uint8Array[]) {
  return new ReadableStream<Uint8Array>({
    start(controller) {
      for (const chunk of chunks) {
        controller.enqueue(chunk);
      }
      controller.close();
    },
  });
}

describe("fetchRemoteMedia", () => {
  it("rejects when content-length exceeds maxBytes", async () => {
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3, 4, 5])]), {
        status: 200,
        headers: { "content-length": "5" },
      });

    await expect(
      fetchRemoteMedia({
        url: "https://example.com/file.bin",
        fetchImpl,
        maxBytes: 4,
      }),
    ).rejects.toThrow("exceeds maxBytes");
  });

  it("rejects when streamed payload exceeds maxBytes", async () => {
    const fetchImpl = async () =>
      new Response(makeStream([new Uint8Array([1, 2, 3]), new Uint8Array([4, 5, 6])]), {
        status: 200,
      });

    await expect(
      fetchRemoteMedia({
        url: "https://example.com/file.bin",
        fetchImpl,
        maxBytes: 4,
      }),
    ).rejects.toThrow("exceeds maxBytes");
  });
});
src/media/fetch.ts (modified)
@@ -14,6 +14,7 @@ type FetchMediaOptions = {
   url: string;
   fetchImpl?: FetchLike;
   filePathHint?: string;
+  maxBytes?: number;
 };
 
 function stripQuotes(value: string): string {
@@ -51,7 +52,7 @@ async function readErrorBodySnippet(res: Response, maxChars = 200): Promise<stri
 }
 
 export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<FetchMediaResult> {
-  const { url, fetchImpl, filePathHint } = options;
+  const { url, fetchImpl, filePathHint, maxBytes } = options;
   const fetcher: FetchLike | undefined = fetchImpl ?? globalThis.fetch;
   if (!fetcher) {
     throw new Error("fetch is not available");
@@ -77,7 +78,19 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
     throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
   }
 
-  const buffer = Buffer.from(await res.arrayBuffer());
+  const contentLength = res.headers.get("content-length");
+  if (maxBytes && contentLength) {
+    const length = Number(contentLength);
+    if (Number.isFinite(length) && length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
+      );
+    }
+  }
+
+  const buffer = maxBytes
+    ? await readResponseWithLimit(res, maxBytes)
+    : Buffer.from(await res.arrayBuffer());
   let fileNameFromUrl: string | undefined;
   try {
     const parsed = new URL(url);
@@ -109,3 +122,47 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
     fileName,
   };
 }
+
+async function readResponseWithLimit(res: Response, maxBytes: number): Promise<Buffer> {
+  const body = res.body;
+  if (!body || typeof body.getReader !== "function") {
+    const fallback = Buffer.from(await res.arrayBuffer());
+    if (fallback.length > maxBytes) {
+      throw new Error(
+        `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+      );
+    }
+    return fallback;
+  }
+
+  const reader = body.getReader();
+  const chunks: Uint8Array[] = [];
+  let total = 0;
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (value?.length) {
+        total += value.length;
+        if (total > maxBytes) {
+          try {
+            await reader.cancel();
+          } catch {}
+          throw new Error(
+            `Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
+          );
+        }
+        chunks.push(value);
+      }
+    }
+  } finally {
+    try {
+      reader.releaseLock();
+    } catch {}
+  }
+
+  return Buffer.concat(
+    chunks.map((chunk) => Buffer.from(chunk)),
+    total,
+  );
+}
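A sketch of the new cap in use; the URL is a placeholder and the 10MB figure mirrors the documented image default:

import { fetchRemoteMedia } from "./src/media/fetch.js";

// Rejects early on content-length when the server declares a size,
// or mid-stream once more than maxBytes have been read, rather than
// buffering the whole payload first.
const { buffer, fileName } = await fetchRemoteMedia({
  url: "https://example.com/photo.jpg",
  maxBytes: 10 * 1024 * 1024,
});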