fix(media): strip audio attachments after successful transcription (openclaw#9076) thanks @nobrainer-tech

Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test (fails in known unrelated telegram suite)
- pnpm vitest run src/auto-reply/media-note.test.ts src/auto-reply/reply.media-note.test.ts

Co-authored-by: nobrainer-tech <445466+nobrainer-tech@users.noreply.github.com>
This commit is contained in:
Arkadiusz Mastalerz
2026-02-13 02:01:53 +01:00
committed by GitHub
parent a6003d6711
commit 7081dee1af
2 changed files with 151 additions and 1 deletions

View File

@@ -17,12 +17,45 @@ function formatMediaAttachedLine(params: {
return `${prefix}${params.path}${typePart}${urlPart}]`;
}
// Lowercase, dot-prefixed file extensions that mark an attachment path as audio
// for the purpose of transcription detection.
const AUDIO_EXTENSIONS = new Set([
  ".ogg",
  ".opus",
  ".mp3",
  ".m4a",
  ".wav",
  ".webm",
  ".flac",
  ".aac",
  ".wma",
  ".aiff",
  ".alac",
  ".oga",
]);

/**
 * Reports whether a path ends with one of the known audio extensions.
 *
 * The comparison is case-insensitive and is a plain suffix match against the
 * whole string, so only the trailing characters of `path` matter.
 *
 * @param path - Attachment path to inspect; may be missing.
 * @returns `true` when `path` is non-empty and ends with an entry of
 *   `AUDIO_EXTENSIONS`, otherwise `false`.
 */
function isAudioPath(path: string | undefined): boolean {
  // A missing path collapses to "", which no (non-empty) extension can be a suffix of.
  const normalized = (path || "").toLowerCase();
  return Array.from(AUDIO_EXTENSIONS).some((suffix) => normalized.endsWith(suffix));
}
export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
// Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
const suppressed = new Set<number>();
const transcribedAudioIndices = new Set<number>();
if (Array.isArray(ctx.MediaUnderstanding)) {
for (const output of ctx.MediaUnderstanding) {
suppressed.add(output.attachmentIndex);
if (output.kind === "audio.transcription") {
transcribedAudioIndices.add(output.attachmentIndex);
}
}
}
if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
@@ -33,6 +66,9 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
for (const attachment of decision.attachments) {
if (attachment.chosen?.outcome === "success") {
suppressed.add(attachment.attachmentIndex);
if (decision.capability === "audio") {
transcribedAudioIndices.add(attachment.attachmentIndex);
}
}
}
}
@@ -56,6 +92,10 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
? ctx.MediaTypes
: undefined;
const hasTranscript = Boolean(ctx.Transcript?.trim());
// Transcript alone does not identify an attachment index; only use it as a fallback
// when there is a single attachment to avoid stripping unrelated audio files.
const canStripSingleAttachmentByTranscript = hasTranscript && paths.length === 1;
const entries = paths
.map((entry, index) => ({
@@ -64,7 +104,28 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
url: urls?.[index] ?? ctx.MediaUrl,
index,
}))
.filter((entry) => !suppressed.has(entry.index));
.filter((entry) => {
if (suppressed.has(entry.index)) {
return false;
}
// Strip audio attachments when transcription succeeded - the transcript is already
// available in the context, raw audio binary would only waste tokens (issue #4197)
// Note: Only trust MIME type from per-entry types array, not fallback ctx.MediaType
// which could misclassify non-audio attachments (greptile review feedback)
const hasPerEntryType = types !== undefined;
const isAudioByMime = hasPerEntryType && entry.type?.toLowerCase().startsWith("audio/");
const isAudioEntry = isAudioPath(entry.path) || isAudioByMime;
if (!isAudioEntry) {
return true;
}
if (
transcribedAudioIndices.has(entry.index) ||
(canStripSingleAttachmentByTranscript && entry.index === 0)
) {
return false;
}
return true;
});
if (entries.length === 0) {
return undefined;
}