mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 16:14:31 +00:00
feat: improve media auto-detect
This commit is contained in:
84
src/media-understanding/providers/google/audio.ts
Normal file
84
src/media-understanding/providers/google/audio.ts
Normal file
@@ -0,0 +1,84 @@
|
||||
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
|
||||
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
|
||||
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
/** Base URL handed to `normalizeBaseUrl` as the fallback argument when building the Gemini transcription endpoint. */
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
|
||||
/** Model ID that `resolveModel` returns when the caller supplies no model or only whitespace. */
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
|
||||
/** Instruction text that `resolvePrompt` returns when the caller supplies no prompt or only whitespace. */
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
|
||||
|
||||
/**
 * Chooses the model ID to use for a transcription request.
 *
 * @param model - Optional caller-supplied model ID; surrounding whitespace is ignored.
 * @returns `DEFAULT_GOOGLE_AUDIO_MODEL` when `model` is missing or blank, otherwise
 *   the trimmed ID after it has been passed through `normalizeGoogleModelId`.
 */
function resolveModel(model?: string): string {
  const requested = model?.trim() ?? "";
  return requested.length > 0
    ? normalizeGoogleModelId(requested)
    : DEFAULT_GOOGLE_AUDIO_MODEL;
}
|
||||
|
||||
/**
 * Chooses the instruction text sent alongside the audio.
 *
 * @param prompt - Optional caller-supplied prompt; surrounding whitespace is ignored.
 * @returns The trimmed prompt, or `DEFAULT_GOOGLE_AUDIO_PROMPT` when it is missing or blank.
 */
function resolvePrompt(prompt?: string): string {
  const requested = prompt?.trim();
  if (requested) {
    return requested;
  }
  return DEFAULT_GOOGLE_AUDIO_PROMPT;
}
|
||||
|
||||
/**
 * Transcribes an audio buffer by POSTing it to a `models/<model>:generateContent` endpoint.
 *
 * The request carries one `user` content entry with two parts: the resolved prompt text
 * and the audio inlined as base64 (`inline_data`). The MIME type defaults to `audio/wav`
 * when `params.mime` is null or undefined.
 *
 * @param params - Transcription request. This function reads `fetchFn`, `baseUrl`, `model`,
 *   `headers`, `apiKey`, `prompt`, `mime`, `buffer` and `timeoutMs` from it.
 * @returns The joined transcript text and the model ID that was used.
 * @throws Error when the response is not OK (message includes the HTTP status and, if
 *   available, the detail read from the response) or when the response yields no text.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  // Shape of the response fields this function reads; everything else is ignored.
  type GeminiTextPayload = {
    candidates?: Array<{
      content?: { parts?: Array<{ text?: string }> };
    }>;
  };

  const doFetch = params.fetchFn ?? fetch;
  const apiRoot = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_AUDIO_BASE_URL);
  const model = resolveModel(params.model);
  const endpoint = `${apiRoot}/models/${model}:generateContent`;

  // Caller-provided headers win; defaults are only filled in when absent.
  const requestHeaders = new Headers(params.headers);
  if (!requestHeaders.has("content-type")) {
    requestHeaders.set("content-type", "application/json");
  }
  if (!requestHeaders.has("x-goog-api-key")) {
    requestHeaders.set("x-goog-api-key", params.apiKey);
  }

  const promptPart = { text: resolvePrompt(params.prompt) };
  const audioPart = {
    inline_data: {
      mime_type: params.mime ?? "audio/wav",
      data: params.buffer.toString("base64"),
    },
  };
  const requestJson = JSON.stringify({
    contents: [{ role: "user", parts: [promptPart, audioPart] }],
  });

  const response = await fetchWithTimeout(
    endpoint,
    { method: "POST", headers: requestHeaders, body: requestJson },
    params.timeoutMs,
    doFetch,
  );

  if (!response.ok) {
    const detail = await readErrorResponse(response);
    let message = `Audio transcription failed (HTTP ${response.status})`;
    if (detail) {
      message += `: ${detail}`;
    }
    throw new Error(message);
  }

  const payload = (await response.json()) as GeminiTextPayload;
  // Only the first candidate is consulted; blank or missing part texts are dropped.
  const candidateParts = payload.candidates?.[0]?.content?.parts ?? [];
  const segments = candidateParts.reduce<string[]>((collected, part) => {
    const segment = part?.text?.trim();
    if (segment) {
      collected.push(segment);
    }
    return collected;
  }, []);
  const text = segments.join("\n");

  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}
|
||||
@@ -1,10 +1,12 @@
|
||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||
import { describeImageWithModel } from "../image.js";
|
||||
import { transcribeGeminiAudio } from "./audio.js";
|
||||
import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
/**
 * Provider entry registered under the id `"google"`.
 *
 * Declares image, audio and video capabilities and wires each one to its handler:
 * `describeImageWithModel`, `transcribeGeminiAudio` and `describeGeminiVideo`.
 */
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  capabilities: ["image", "audio", "video"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeGeminiAudio,
  describeVideo: describeGeminiVideo,
};
|
||||
|
||||
@@ -4,7 +4,7 @@ import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../
|
||||
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
|
||||
|
||||
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
|
||||
const DEFAULT_OPENAI_AUDIO_MODEL = "whisper-1";
|
||||
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
|
||||
|
||||
function resolveModel(model?: string): string {
|
||||
const trimmed = model?.trim();
|
||||
|
||||
Reference in New Issue
Block a user