refactor(media): split audio helpers and attachment cache

This commit is contained in:
Peter Steinberger
2026-03-02 22:00:46 +00:00
parent 9bde7f4fde
commit 6545317a2c
18 changed files with 776 additions and 749 deletions

View File

@@ -0,0 +1,54 @@
/**
 * Dot-path configuration keys for the audio media-understanding tool.
 *
 * This list is the single source of truth: the help and label records in
 * this module are keyed by it, and schema aggregation tables spread it to
 * avoid repeating every key. Add new audio settings here first so the
 * derived `Record`s fail to compile until help/label text is provided.
 */
export const MEDIA_AUDIO_FIELD_KEYS = [
  "tools.media.audio.enabled",
  "tools.media.audio.maxBytes",
  "tools.media.audio.maxChars",
  "tools.media.audio.prompt",
  "tools.media.audio.timeoutSeconds",
  "tools.media.audio.language",
  "tools.media.audio.attachments",
  "tools.media.audio.models",
  "tools.media.audio.scope",
  "tools.media.audio.echoTranscript",
  "tools.media.audio.echoFormat",
] as const;

/**
 * Union of the audio field keys above, derived via indexed access so it can
 * never drift from the array. Exported so consumers of the exported records
 * can name the key type instead of re-deriving it.
 */
export type MediaAudioFieldKey = (typeof MEDIA_AUDIO_FIELD_KEYS)[number];
/**
 * Help text for each audio media-understanding config key, spread into the
 * schema-wide FIELD_HELP record.
 *
 * Checked with `satisfies` rather than a type annotation: exhaustiveness
 * against MediaAudioFieldKey is still enforced (a missing or extra key is a
 * compile error), but the literal key/value types are preserved instead of
 * being widened to Record<MediaAudioFieldKey, string>.
 */
export const MEDIA_AUDIO_FIELD_HELP = {
  "tools.media.audio.enabled":
    "Enable audio understanding so voice notes or audio clips can be transcribed/summarized for agent context. Disable when audio ingestion is outside policy or unnecessary for your workflows.",
  "tools.media.audio.maxBytes":
    "Maximum accepted audio payload size in bytes before processing is rejected or clipped by policy. Set this based on expected recording length and upstream provider limits.",
  "tools.media.audio.maxChars":
    "Maximum characters retained from audio understanding output to prevent oversized transcript injection. Increase for long-form dictation, or lower to keep conversational turns compact.",
  "tools.media.audio.prompt":
    "Instruction template guiding audio understanding output style, such as concise summary versus near-verbatim transcript. Keep wording consistent so downstream automations can rely on output format.",
  "tools.media.audio.timeoutSeconds":
    "Timeout in seconds for audio understanding execution before the operation is cancelled. Use longer timeouts for long recordings and tighter ones for interactive chat responsiveness.",
  "tools.media.audio.language":
    "Preferred language hint for audio understanding/transcription when provider support is available. Set this to improve recognition accuracy for known primary languages.",
  "tools.media.audio.attachments":
    "Attachment policy for audio inputs indicating which uploaded files are eligible for audio processing. Keep restrictive defaults in mixed-content channels to avoid unintended audio workloads.",
  "tools.media.audio.models":
    "Ordered model preferences specifically for audio understanding, used before shared media model fallback. Choose models optimized for transcription quality in your primary language/domain.",
  "tools.media.audio.scope":
    "Scope selector for when audio understanding runs across inbound messages and attachments. Keep focused scopes in high-volume channels to reduce cost and avoid accidental transcription.",
  "tools.media.audio.echoTranscript":
    "Echo the audio transcript back to the originating chat before agent processing. When enabled, users immediately see what was heard from their voice note, helping them verify transcription accuracy before the agent acts on it. Default: false.",
  "tools.media.audio.echoFormat":
    "Format string for the echoed transcript message. Use `{transcript}` as a placeholder for the transcribed text. Default: '📝 \"{transcript}\"'.",
} satisfies Record<MediaAudioFieldKey, string>;
/**
 * Human-readable UI labels for each audio media-understanding config key,
 * spread into the schema-wide FIELD_LABELS record.
 *
 * Uses `satisfies` (matching MEDIA_AUDIO_FIELD_HELP) so every
 * MediaAudioFieldKey must have a label at compile time while the literal
 * key/value types stay narrow instead of widening to a plain Record.
 */
export const MEDIA_AUDIO_FIELD_LABELS = {
  "tools.media.audio.enabled": "Enable Audio Understanding",
  "tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
  "tools.media.audio.maxChars": "Audio Understanding Max Chars",
  "tools.media.audio.prompt": "Audio Understanding Prompt",
  "tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
  "tools.media.audio.language": "Audio Understanding Language",
  "tools.media.audio.attachments": "Audio Understanding Attachment Policy",
  "tools.media.audio.models": "Audio Understanding Models",
  "tools.media.audio.scope": "Audio Understanding Scope",
  "tools.media.audio.echoTranscript": "Echo Transcript to Chat",
  "tools.media.audio.echoFormat": "Transcript Echo Format",
} satisfies Record<MediaAudioFieldKey, string>;

View File

@@ -1,4 +1,5 @@
import { describe, expect, it } from "vitest";
import { MEDIA_AUDIO_FIELD_KEYS } from "./media-audio-field-metadata.js";
import { FIELD_HELP } from "./schema.help.js";
import { FIELD_LABELS } from "./schema.labels.js";
@@ -457,15 +458,7 @@ const TOOLS_HOOKS_TARGET_KEYS = [
"tools.links.models",
"tools.links.scope",
"tools.links.timeoutSeconds",
"tools.media.audio.attachments",
"tools.media.audio.enabled",
"tools.media.audio.language",
"tools.media.audio.maxBytes",
"tools.media.audio.maxChars",
"tools.media.audio.models",
"tools.media.audio.prompt",
"tools.media.audio.scope",
"tools.media.audio.timeoutSeconds",
...MEDIA_AUDIO_FIELD_KEYS,
"tools.media.concurrency",
"tools.media.image.attachments",
"tools.media.image.enabled",

View File

@@ -1,3 +1,4 @@
import { MEDIA_AUDIO_FIELD_HELP } from "./media-audio-field-metadata.js";
import { IRC_FIELD_HELP } from "./schema.irc.js";
export const FIELD_HELP: Record<string, string> = {
@@ -527,28 +528,7 @@ export const FIELD_HELP: Record<string, string> = {
"Ordered model preferences specifically for image understanding when you want to override shared media models. Put the most reliable multimodal model first to reduce fallback attempts.",
"tools.media.image.scope":
"Scope selector for when image understanding is attempted (for example only explicit requests versus broader auto-detection). Keep narrow scope in busy channels to control token and API spend.",
"tools.media.audio.enabled":
"Enable audio understanding so voice notes or audio clips can be transcribed/summarized for agent context. Disable when audio ingestion is outside policy or unnecessary for your workflows.",
"tools.media.audio.maxBytes":
"Maximum accepted audio payload size in bytes before processing is rejected or clipped by policy. Set this based on expected recording length and upstream provider limits.",
"tools.media.audio.maxChars":
"Maximum characters retained from audio understanding output to prevent oversized transcript injection. Increase for long-form dictation, or lower to keep conversational turns compact.",
"tools.media.audio.prompt":
"Instruction template guiding audio understanding output style, such as concise summary versus near-verbatim transcript. Keep wording consistent so downstream automations can rely on output format.",
"tools.media.audio.timeoutSeconds":
"Timeout in seconds for audio understanding execution before the operation is cancelled. Use longer timeouts for long recordings and tighter ones for interactive chat responsiveness.",
"tools.media.audio.language":
"Preferred language hint for audio understanding/transcription when provider support is available. Set this to improve recognition accuracy for known primary languages.",
"tools.media.audio.attachments":
"Attachment policy for audio inputs indicating which uploaded files are eligible for audio processing. Keep restrictive defaults in mixed-content channels to avoid unintended audio workloads.",
"tools.media.audio.models":
"Ordered model preferences specifically for audio understanding, used before shared media model fallback. Choose models optimized for transcription quality in your primary language/domain.",
"tools.media.audio.scope":
"Scope selector for when audio understanding runs across inbound messages and attachments. Keep focused scopes in high-volume channels to reduce cost and avoid accidental transcription.",
"tools.media.audio.echoTranscript":
"Echo the audio transcript back to the originating chat before agent processing. When enabled, users immediately see what was heard from their voice note, helping them verify transcription accuracy before the agent acts on it. Default: false.",
"tools.media.audio.echoFormat":
"Format string for the echoed transcript message. Use `{transcript}` as a placeholder for the transcribed text. Default: '📝 \"{transcript}\"'.",
...MEDIA_AUDIO_FIELD_HELP,
"tools.media.video.enabled":
"Enable video understanding so clips can be summarized into text for downstream reasoning and responses. Disable when processing video is out of policy or too expensive for your deployment.",
"tools.media.video.maxBytes":

View File

@@ -1,3 +1,4 @@
import { MEDIA_AUDIO_FIELD_LABELS } from "./media-audio-field-metadata.js";
import { IRC_FIELD_LABELS } from "./schema.irc.js";
export const FIELD_LABELS: Record<string, string> = {
@@ -128,17 +129,7 @@ export const FIELD_LABELS: Record<string, string> = {
"tools.media.image.scope": "Image Understanding Scope",
"tools.media.models": "Media Understanding Shared Models",
"tools.media.concurrency": "Media Understanding Concurrency",
"tools.media.audio.enabled": "Enable Audio Understanding",
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
"tools.media.audio.prompt": "Audio Understanding Prompt",
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
"tools.media.audio.language": "Audio Understanding Language",
"tools.media.audio.attachments": "Audio Understanding Attachment Policy",
"tools.media.audio.models": "Audio Understanding Models",
"tools.media.audio.scope": "Audio Understanding Scope",
"tools.media.audio.echoTranscript": "Echo Transcript to Chat",
"tools.media.audio.echoFormat": "Transcript Echo Format",
...MEDIA_AUDIO_FIELD_LABELS,
"tools.media.video.enabled": "Enable Video Understanding",
"tools.media.video.maxBytes": "Video Understanding Max Bytes",
"tools.media.video.maxChars": "Video Understanding Max Chars",