feat: improve media auto-detect

This commit is contained in:
Peter Steinberger
2026-01-23 05:47:09 +00:00
parent 1d9f230be4
commit 2dfbd1c1f6
6 changed files with 561 additions and 38 deletions

View File

@@ -0,0 +1,84 @@
import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../types.js";
import { normalizeGoogleModelId } from "../../../agents/models-config.providers.js";
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
// Default REST root for the Gemini Generative Language API (v1beta).
export const DEFAULT_GOOGLE_AUDIO_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
// Model used when the caller does not supply one (see resolveModel below).
const DEFAULT_GOOGLE_AUDIO_MODEL = "gemini-3-flash-preview";
// Instruction sent as the text part alongside the inline audio when the
// caller provides no prompt (see resolvePrompt below).
const DEFAULT_GOOGLE_AUDIO_PROMPT = "Transcribe the audio.";
/**
 * Choose the Gemini model id to call: the trimmed caller-supplied id,
 * normalized via normalizeGoogleModelId, or the module default when the
 * input is absent or blank.
 */
function resolveModel(model?: string): string {
  const candidate = model?.trim();
  return candidate ? normalizeGoogleModelId(candidate) : DEFAULT_GOOGLE_AUDIO_MODEL;
}
/**
 * Return the trimmed caller prompt, falling back to the module default
 * when the input is absent or trims to the empty string.
 */
function resolvePrompt(prompt?: string): string {
  const candidate = prompt?.trim();
  if (candidate) {
    return candidate;
  }
  return DEFAULT_GOOGLE_AUDIO_PROMPT;
}
/**
 * Transcribe an audio buffer with the Gemini `generateContent` endpoint.
 *
 * Sends a single user turn containing the (resolved) prompt text plus the
 * audio as a base64 `inline_data` part, then joins the text of all parts in
 * the first candidate into the transcript.
 *
 * @param params request options: api key, optional model/prompt/mime/baseUrl,
 *   the audio `buffer`, optional extra `headers`, `timeoutMs`, and an
 *   injectable `fetchFn` (defaults to global fetch).
 * @returns the transcript text and the model id actually used.
 * @throws Error on a non-2xx HTTP response, or when the response contains
 *   no non-empty text parts.
 */
export async function transcribeGeminiAudio(
  params: AudioTranscriptionRequest,
): Promise<AudioTranscriptionResult> {
  const doFetch = params.fetchFn ?? fetch;
  const model = resolveModel(params.model);
  const base = normalizeBaseUrl(params.baseUrl, DEFAULT_GOOGLE_AUDIO_BASE_URL);
  const endpoint = `${base}/models/${model}:generateContent`;

  // Caller-supplied headers win; only fill in the ones that are missing.
  const headers = new Headers(params.headers);
  if (!headers.has("content-type")) headers.set("content-type", "application/json");
  if (!headers.has("x-goog-api-key")) headers.set("x-goog-api-key", params.apiKey);

  const requestBody = {
    contents: [
      {
        role: "user",
        parts: [
          { text: resolvePrompt(params.prompt) },
          {
            inline_data: {
              // assumes WAV when the caller gives no mime — TODO confirm upstream
              mime_type: params.mime ?? "audio/wav",
              data: params.buffer.toString("base64"),
            },
          },
        ],
      },
    ],
  };

  const res = await fetchWithTimeout(
    endpoint,
    { method: "POST", headers, body: JSON.stringify(requestBody) },
    params.timeoutMs,
    doFetch,
  );

  if (!res.ok) {
    const detail = await readErrorResponse(res);
    throw new Error(
      `Audio transcription failed (HTTP ${res.status})${detail ? `: ${detail}` : ""}`,
    );
  }

  const payload = (await res.json()) as {
    candidates?: Array<{ content?: { parts?: Array<{ text?: string }> } }>;
  };

  // Join every non-empty text part of the first candidate.
  const text = (payload.candidates?.[0]?.content?.parts ?? [])
    .map((part) => part?.text?.trim())
    .filter(Boolean)
    .join("\n");
  if (!text) {
    throw new Error("Audio transcription response missing text");
  }
  return { text, model };
}

View File

@@ -1,10 +1,12 @@
import type { MediaUnderstandingProvider } from "../../types.js";
import { describeImageWithModel } from "../image.js";
import { transcribeGeminiAudio } from "./audio.js";
import { describeGeminiVideo } from "./video.js";

// Google/Gemini media-understanding provider wiring: images go through the
// shared model-based describer, while audio and video use the
// Gemini-specific handlers defined in this directory.
export const googleProvider: MediaUnderstandingProvider = {
  id: "google",
  capabilities: ["image", "audio", "video"],
  describeImage: describeImageWithModel,
  transcribeAudio: transcribeGeminiAudio,
  describeVideo: describeGeminiVideo,
};

View File

@@ -4,7 +4,7 @@ import type { AudioTranscriptionRequest, AudioTranscriptionResult } from "../../
import { fetchWithTimeout, normalizeBaseUrl, readErrorResponse } from "../shared.js";
// Default OpenAI API root for audio transcription requests.
export const DEFAULT_OPENAI_AUDIO_BASE_URL = "https://api.openai.com/v1";
// Default transcription model. NOTE: the source declared this const twice
// ("whisper-1" then "gpt-4o-mini-transcribe"), which is a TS2451
// redeclaration error; keep only the updated value from this change.
const DEFAULT_OPENAI_AUDIO_MODEL = "gpt-4o-mini-transcribe";
function resolveModel(model?: string): string {
const trimmed = model?.trim();