refactor: unify media understanding pipeline

2026-05-09 16:04:33 +00:00 · 2026-01-17 04:38:20 +00:00
parent 49ecbd8fea
commit fcb7c9ff65
24 changed files with 1250 additions and 643 deletions
--- a/src/media-understanding/defaults.ts
+++ b/src/media-understanding/defaults.ts
@@ -0,0 +1,35 @@
+import type { MediaUnderstandingCapability } from "./types.js";
+
+const MB = 1024 * 1024; // 1,048,576 — module-private multiplier used by the byte-size defaults in this file
+
+export const DEFAULT_MAX_CHARS = 500; // shared character limit; reused for the image and video entries of DEFAULT_MAX_CHARS_BY_CAPABILITY
+export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record< // default character limit per capability; `undefined` means no default limit is defined here
+  MediaUnderstandingCapability,
+  number | undefined
+> = {
+  image: DEFAULT_MAX_CHARS,
+  audio: undefined, // NOTE(review): presumably so audio transcripts are not truncated by default — confirm against the consumer
+  video: DEFAULT_MAX_CHARS,
+};
+export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = { // default byte limit per capability (presumably the max accepted media size — verify against caller)
+  image: 10 * MB, // 10 MiB
+  audio: 20 * MB, // 20 MiB
+  video: 50 * MB, // 50 MiB
+};
+export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = { // default timeout per capability, in seconds (per the constant's name)
+  image: 60,
+  audio: 60,
+  video: 120, // video gets twice the image/audio value
+};
+export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = { // default prompt text per capability
+  image: "Describe the image.",
+  audio: "Transcribe the audio.", // audio asks for a transcription, while image and video ask for a description
+  video: "Describe the video.",
+};
+export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB; // 70 MiB; NOTE(review): presumably caps the base64-encoded video payload — the 50 MiB raw video default is ~66.7 MiB after base64's 4/3 expansion; confirm
+export const DEFAULT_AUDIO_MODELS: Record<string, string> = { // default audio model name per string key (keys look like provider ids — TODO confirm)
+  groq: "whisper-large-v3-turbo",
+  openai: "whisper-1",
+};
+export const CLI_OUTPUT_MAX_BUFFER = 5 * MB; // 5 MiB; NOTE(review): presumably used as the output buffer limit when running a CLI child process — confirm at the call site
+export const DEFAULT_MEDIA_CONCURRENCY = 2; // default concurrency value; presumably the number of media items processed in parallel — verify against caller