From 7837d23103da587937e52aa00d4bc3050553affd Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 23 Feb 2026 18:24:50 +0000 Subject: [PATCH] feat(media): add moonshot video provider and wiring Co-authored-by: xiaoyaner0201 --- CHANGELOG.md | 1 + src/media-understanding/defaults.test.ts | 12 +- src/media-understanding/defaults.ts | 2 +- .../providers/index.test.ts | 8 + src/media-understanding/providers/index.ts | 2 + .../providers/moonshot/index.ts | 10 ++ .../providers/moonshot/video.test.ts | 72 ++++++++ .../providers/moonshot/video.ts | 109 ++++++++++++ src/media-understanding/runner.entries.ts | 11 +- src/media-understanding/runner.video.test.ts | 162 ++++++++++++++++++ 10 files changed, 385 insertions(+), 4 deletions(-) create mode 100644 src/media-understanding/providers/moonshot/index.ts create mode 100644 src/media-understanding/providers/moonshot/video.test.ts create mode 100644 src/media-understanding/providers/moonshot/video.ts create mode 100644 src/media-understanding/runner.video.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index a3a58aa0bfc..3d8ecc99c5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Docs: https://docs.openclaw.ai - Agents/Context pruning: extend `cache-ttl` eligibility to Moonshot/Kimi and ZAI/GLM providers (including OpenRouter model refs), so `contextPruning.mode: "cache-ttl"` is no longer silently skipped for those sessions. (#24497) Thanks @lailoo. - Tools/web_search: add `provider: "kimi"` (Moonshot) support with key/config schema wiring and a corrected two-step `$web_search` tool flow that echoes tool results before final synthesis, including citation extraction from search results. (#18822) Thanks @adshine. +- Media understanding/Video: add a native Moonshot video provider and include Moonshot in auto video key detection, plus refactor video execution to honor `entry/config/provider` baseUrl+header precedence (matching audio behavior). (#16616) Thanks @xiaoyaner0201. 
- Sessions/Store: canonicalize inbound mixed-case session keys for metadata and route updates, and migrate legacy case-variant entries to a single lowercase key to prevent duplicate sessions and missing TUI/WebUI history. (#9561) Thanks @hillghost86. - Telegram/Reactions: soft-fail reaction action errors (policy/token/emoji/API), accept snake_case `message_id`, and fallback to inbound message-id context when explicit `messageId` is omitted so DM reactions stay stable without regeneration loops. (#20236, #21001) Thanks @PeterShanxin and @vincentkoc. - Telegram/Polling: scope persisted polling offsets to bot identity and reuse a single awaited runner-stop path on abort/retry, preventing cross-token offset bleed and overlapping pollers during restart/error recovery. (#10850, #11347) Thanks @talhaorak, @anooprdawar, and @vincentkoc. diff --git a/src/media-understanding/defaults.test.ts b/src/media-understanding/defaults.test.ts index 38523b81637..f7bc540b104 100644 --- a/src/media-understanding/defaults.test.ts +++ b/src/media-understanding/defaults.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from "vitest"; -import { AUTO_AUDIO_KEY_PROVIDERS, DEFAULT_AUDIO_MODELS } from "./defaults.js"; +import { + AUTO_AUDIO_KEY_PROVIDERS, + AUTO_VIDEO_KEY_PROVIDERS, + DEFAULT_AUDIO_MODELS, +} from "./defaults.js"; describe("DEFAULT_AUDIO_MODELS", () => { it("includes Mistral Voxtral default", () => { @@ -12,3 +16,9 @@ describe("AUTO_AUDIO_KEY_PROVIDERS", () => { expect(AUTO_AUDIO_KEY_PROVIDERS).toContain("mistral"); }); }); + +describe("AUTO_VIDEO_KEY_PROVIDERS", () => { + it("includes moonshot auto key resolution", () => { + expect(AUTO_VIDEO_KEY_PROVIDERS).toContain("moonshot"); + }); +}); diff --git a/src/media-understanding/defaults.ts b/src/media-understanding/defaults.ts index 22c70f7ca99..67effa90b82 100644 --- a/src/media-understanding/defaults.ts +++ b/src/media-understanding/defaults.ts @@ -48,7 +48,7 @@ export const AUTO_IMAGE_KEY_PROVIDERS = [ "minimax", 
"zai", ] as const; -export const AUTO_VIDEO_KEY_PROVIDERS = ["google"] as const; +export const AUTO_VIDEO_KEY_PROVIDERS = ["google", "moonshot"] as const; export const DEFAULT_IMAGE_MODELS: Record = { openai: "gpt-5-mini", anthropic: "claude-opus-4-6", diff --git a/src/media-understanding/providers/index.test.ts b/src/media-understanding/providers/index.test.ts index f7bf6406b96..430e89e84a6 100644 --- a/src/media-understanding/providers/index.test.ts +++ b/src/media-understanding/providers/index.test.ts @@ -16,4 +16,12 @@ describe("media-understanding provider registry", () => { expect(provider?.id).toBe("google"); }); + + it("registers the Moonshot provider", () => { + const registry = buildMediaUnderstandingRegistry(); + const provider = getMediaUnderstandingProvider("moonshot", registry); + + expect(provider?.id).toBe("moonshot"); + expect(provider?.capabilities).toEqual(["image", "video"]); + }); }); diff --git a/src/media-understanding/providers/index.ts b/src/media-understanding/providers/index.ts index 526632e9ba2..5aef51790a2 100644 --- a/src/media-understanding/providers/index.ts +++ b/src/media-understanding/providers/index.ts @@ -6,6 +6,7 @@ import { googleProvider } from "./google/index.js"; import { groqProvider } from "./groq/index.js"; import { minimaxProvider } from "./minimax/index.js"; import { mistralProvider } from "./mistral/index.js"; +import { moonshotProvider } from "./moonshot/index.js"; import { openaiProvider } from "./openai/index.js"; import { zaiProvider } from "./zai/index.js"; @@ -15,6 +16,7 @@ const PROVIDERS: MediaUnderstandingProvider[] = [ googleProvider, anthropicProvider, minimaxProvider, + moonshotProvider, mistralProvider, zaiProvider, deepgramProvider, diff --git a/src/media-understanding/providers/moonshot/index.ts b/src/media-understanding/providers/moonshot/index.ts new file mode 100644 index 00000000000..78a525129dc --- /dev/null +++ b/src/media-understanding/providers/moonshot/index.ts @@ -0,0 +1,10 @@ +import type { 
MediaUnderstandingProvider } from "../../types.js"; +import { describeImageWithModel } from "../image.js"; +import { describeMoonshotVideo } from "./video.js"; + +export const moonshotProvider: MediaUnderstandingProvider = { + id: "moonshot", + capabilities: ["image", "video"], + describeImage: describeImageWithModel, + describeVideo: describeMoonshotVideo, +}; diff --git a/src/media-understanding/providers/moonshot/video.test.ts b/src/media-understanding/providers/moonshot/video.test.ts new file mode 100644 index 00000000000..eba98042884 --- /dev/null +++ b/src/media-understanding/providers/moonshot/video.test.ts @@ -0,0 +1,72 @@ +import { describe, expect, it } from "vitest"; +import { + createRequestCaptureJsonFetch, + installPinnedHostnameTestHooks, +} from "../audio.test-helpers.js"; +import { describeMoonshotVideo } from "./video.js"; + +installPinnedHostnameTestHooks(); + +describe("describeMoonshotVideo", () => { + it("builds an OpenAI-compatible video request", async () => { + const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ + choices: [{ message: { content: "video ok" } }], + }); + + const result = await describeMoonshotVideo({ + buffer: Buffer.from("video-bytes"), + fileName: "clip.mp4", + apiKey: "moonshot-test", + timeoutMs: 1500, + baseUrl: "https://api.moonshot.ai/v1/", + model: "kimi-k2.5", + headers: { "X-Trace": "1" }, + fetchFn, + }); + const { url, init } = getRequest(); + + expect(result.text).toBe("video ok"); + expect(result.model).toBe("kimi-k2.5"); + expect(url).toBe("https://api.moonshot.ai/v1/chat/completions"); + expect(init?.method).toBe("POST"); + expect(init?.signal).toBeInstanceOf(AbortSignal); + + const headers = new Headers(init?.headers); + expect(headers.get("authorization")).toBe("Bearer moonshot-test"); + expect(headers.get("content-type")).toBe("application/json"); + expect(headers.get("x-trace")).toBe("1"); + + const body = JSON.parse(typeof init?.body === "string" ? 
init.body : "{}") as { + model?: string; + messages?: Array<{ + content?: Array<{ type?: string; text?: string; video_url?: { url?: string } }>; + }>; + }; + expect(body.model).toBe("kimi-k2.5"); + expect(body.messages?.[0]?.content?.[0]).toMatchObject({ + type: "text", + text: "Describe the video.", + }); + expect(body.messages?.[0]?.content?.[1]?.type).toBe("video_url"); + expect(body.messages?.[0]?.content?.[1]?.video_url?.url).toBe( + `data:video/mp4;base64,${Buffer.from("video-bytes").toString("base64")}`, + ); + }); + + it("falls back to reasoning_content when content is empty", async () => { + const { fetchFn } = createRequestCaptureJsonFetch({ + choices: [{ message: { content: "", reasoning_content: "reasoned answer" } }], + }); + + const result = await describeMoonshotVideo({ + buffer: Buffer.from("video"), + fileName: "clip.mp4", + apiKey: "moonshot-test", + timeoutMs: 1000, + fetchFn, + }); + + expect(result.text).toBe("reasoned answer"); + expect(result.model).toBe("kimi-k2.5"); + }); +}); diff --git a/src/media-understanding/providers/moonshot/video.ts b/src/media-understanding/providers/moonshot/video.ts new file mode 100644 index 00000000000..c4548900307 --- /dev/null +++ b/src/media-understanding/providers/moonshot/video.ts @@ -0,0 +1,109 @@ +import type { VideoDescriptionRequest, VideoDescriptionResult } from "../../types.js"; +import { assertOkOrThrowHttpError, fetchWithTimeoutGuarded, normalizeBaseUrl } from "../shared.js"; + +export const DEFAULT_MOONSHOT_VIDEO_BASE_URL = "https://api.moonshot.ai/v1"; +const DEFAULT_MOONSHOT_VIDEO_MODEL = "kimi-k2.5"; +const DEFAULT_MOONSHOT_VIDEO_PROMPT = "Describe the video."; + +type MoonshotVideoPayload = { + choices?: Array<{ + message?: { + content?: string | Array<{ text?: string }>; + reasoning_content?: string; + }; + }>; +}; + +function resolveModel(model?: string): string { + const trimmed = model?.trim(); + return trimmed || DEFAULT_MOONSHOT_VIDEO_MODEL; +} + +function resolvePrompt(prompt?: string): 
string { + const trimmed = prompt?.trim(); + return trimmed || DEFAULT_MOONSHOT_VIDEO_PROMPT; +} + +function coerceMoonshotText(payload: MoonshotVideoPayload): string | null { + const message = payload.choices?.[0]?.message; + if (!message) { + return null; + } + if (typeof message.content === "string" && message.content.trim()) { + return message.content.trim(); + } + if (Array.isArray(message.content)) { + const text = message.content + .map((part) => (typeof part.text === "string" ? part.text.trim() : "")) + .filter(Boolean) + .join("\n") + .trim(); + if (text) { + return text; + } + } + if (typeof message.reasoning_content === "string" && message.reasoning_content.trim()) { + return message.reasoning_content.trim(); + } + return null; +} + +export async function describeMoonshotVideo( + params: VideoDescriptionRequest, +): Promise<VideoDescriptionResult> { + const fetchFn = params.fetchFn ?? fetch; + const baseUrl = normalizeBaseUrl(params.baseUrl, DEFAULT_MOONSHOT_VIDEO_BASE_URL); + const model = resolveModel(params.model); + const mime = params.mime ?? 
"video/mp4"; + const prompt = resolvePrompt(params.prompt); + const url = `${baseUrl}/chat/completions`; + + const headers = new Headers(params.headers); + if (!headers.has("content-type")) { + headers.set("content-type", "application/json"); + } + if (!headers.has("authorization")) { + headers.set("authorization", `Bearer ${params.apiKey}`); + } + + const body = { + model, + messages: [ + { + role: "user", + content: [ + { type: "text", text: prompt }, + { + type: "video_url", + video_url: { + url: `data:${mime};base64,${params.buffer.toString("base64")}`, + }, + }, + ], + }, + ], + }; + + const { response: res, release } = await fetchWithTimeoutGuarded( + url, + { + method: "POST", + headers, + body: JSON.stringify(body), + }, + params.timeoutMs, + fetchFn, + ); + + try { + await assertOkOrThrowHttpError(res, "Moonshot video description failed"); + const payload = (await res.json()) as MoonshotVideoPayload; + const text = coerceMoonshotText(payload); + if (!text) { + throw new Error("Moonshot video description response missing content"); + } + return { text, model }; + } finally { + await release(); + } +} diff --git a/src/media-understanding/runner.entries.ts b/src/media-understanding/runner.entries.ts index 19d73a8ece0..3ef48b0ce4f 100644 --- a/src/media-understanding/runner.entries.ts +++ b/src/media-understanding/runner.entries.ts @@ -497,6 +497,13 @@ export async function runProviderEntry(params: { entry, agentDir: params.agentDir, }); + const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl; + const mergedHeaders = { + ...providerConfig?.headers, + ...params.config?.headers, + ...entry.headers, + }; + const headers = Object.keys(mergedHeaders).length > 0 ? 
mergedHeaders : undefined; const result = await executeWithApiKeyRotation({ provider: providerId, apiKeys, @@ -506,8 +513,8 @@ export async function runProviderEntry(params: { fileName: media.fileName, mime: media.mime, apiKey, - baseUrl: providerConfig?.baseUrl, - headers: providerConfig?.headers, + baseUrl, + headers, model: entry.model, prompt, timeoutMs, diff --git a/src/media-understanding/runner.video.test.ts b/src/media-understanding/runner.video.test.ts new file mode 100644 index 00000000000..6eba2ad15d4 --- /dev/null +++ b/src/media-understanding/runner.video.test.ts @@ -0,0 +1,162 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import type { OpenClawConfig } from "../config/config.js"; +import { withEnvAsync } from "../test-utils/env.js"; +import { createMediaAttachmentCache, normalizeMediaAttachments, runCapability } from "./runner.js"; + +async function withVideoFixture( + filePrefix: string, + run: (params: { + ctx: { MediaPath: string; MediaType: string }; + media: ReturnType<typeof normalizeMediaAttachments>; + cache: ReturnType<typeof createMediaAttachmentCache>; + }) => Promise<void>, +) { + const tmpPath = path.join(os.tmpdir(), `${filePrefix}-${Date.now().toString()}.mp4`); + await fs.writeFile(tmpPath, Buffer.from("video")); + const ctx = { MediaPath: tmpPath, MediaType: "video/mp4" }; + const media = normalizeMediaAttachments(ctx); + const cache = createMediaAttachmentCache(media, { + localPathRoots: [path.dirname(tmpPath)], + }); + try { + await withEnvAsync({ PATH: "" }, async () => { + await run({ ctx, media, cache }); + }); + } finally { + await cache.cleanup(); + await fs.unlink(tmpPath).catch(() => {}); + } +} + +describe("runCapability video provider wiring", () => { + it("merges video baseUrl and headers with entry precedence", async () => { + let seenBaseUrl: string | undefined; + let seenHeaders: Record<string, string> | undefined; + + await withVideoFixture("openclaw-video-merge", async ({ ctx, media, cache }) => { + const cfg 
= { + models: { + providers: { + moonshot: { + apiKey: "provider-key", + baseUrl: "https://provider.example/v1", + headers: { "X-Provider": "1" }, + models: [], + }, + }, + }, + tools: { + media: { + video: { + enabled: true, + baseUrl: "https://config.example/v1", + headers: { "X-Config": "2" }, + models: [ + { + provider: "moonshot", + model: "kimi-k2.5", + baseUrl: "https://entry.example/v1", + headers: { "X-Entry": "3" }, + }, + ], + }, + }, + }, + } as unknown as OpenClawConfig; + + const result = await runCapability({ + capability: "video", + cfg, + ctx, + attachments: cache, + media, + providerRegistry: new Map([ + [ + "moonshot", + { + id: "moonshot", + capabilities: ["video"], + describeVideo: async (req) => { + seenBaseUrl = req.baseUrl; + seenHeaders = req.headers; + return { text: "video ok", model: req.model }; + }, + }, + ], + ]), + }); + + expect(result.outputs[0]?.text).toBe("video ok"); + expect(result.outputs[0]?.provider).toBe("moonshot"); + expect(seenBaseUrl).toBe("https://entry.example/v1"); + expect(seenHeaders).toMatchObject({ + "X-Provider": "1", + "X-Config": "2", + "X-Entry": "3", + }); + }); + }); + + it("auto-selects moonshot for video when google is unavailable", async () => { + await withEnvAsync( + { + GEMINI_API_KEY: undefined, + MOONSHOT_API_KEY: undefined, + }, + async () => { + await withVideoFixture("openclaw-video-auto-moonshot", async ({ ctx, media, cache }) => { + const cfg = { + models: { + providers: { + moonshot: { + apiKey: "moonshot-key", + models: [], + }, + }, + }, + tools: { + media: { + video: { + enabled: true, + }, + }, + }, + } as unknown as OpenClawConfig; + + const result = await runCapability({ + capability: "video", + cfg, + ctx, + attachments: cache, + media, + providerRegistry: new Map([ + [ + "google", + { + id: "google", + capabilities: ["video"], + describeVideo: async () => ({ text: "google" }), + }, + ], + [ + "moonshot", + { + id: "moonshot", + capabilities: ["video"], + describeVideo: async () => ({ 
text: "moonshot", model: "kimi-k2.5" }), + }, + ], + ]), + }); + + expect(result.decision.outcome).toBe("success"); + expect(result.outputs[0]?.provider).toBe("moonshot"); + expect(result.outputs[0]?.text).toBe("moonshot"); + }); + }, + ); + }); +});