From 7081dee1aff61e980bb8fc42c68ba3eb669a78cc Mon Sep 17 00:00:00 2001 From: Arkadiusz Mastalerz Date: Fri, 13 Feb 2026 02:01:53 +0100 Subject: [PATCH] fix(media): strip audio attachments after successful transcription (openclaw#9076) thanks @nobrainer-tech Verified: - pnpm install --frozen-lockfile - pnpm build - pnpm check - pnpm test (fails in known unrelated telegram suite) - pnpm vitest run src/auto-reply/media-note.test.ts src/auto-reply/reply.media-note.test.ts Co-authored-by: nobrainer-tech <445466+nobrainer-tech@users.noreply.github.com> --- src/auto-reply/media-note.test.ts | 89 +++++++++++++++++++++++++++++++ src/auto-reply/media-note.ts | 63 +++++++++++++++++++++- 2 files changed, 151 insertions(+), 1 deletion(-) diff --git a/src/auto-reply/media-note.test.ts b/src/auto-reply/media-note.test.ts index 5d9ae04cbcf..3eb357bff89 100644 --- a/src/auto-reply/media-note.test.ts +++ b/src/auto-reply/media-note.test.ts @@ -106,4 +106,93 @@ describe("buildInboundMediaNote", () => { }); expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]"); }); + + it("strips audio attachments when transcription succeeded via MediaUnderstanding (issue #4197)", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/voice.ogg", "/tmp/image.png"], + MediaUrls: ["https://example.com/voice.ogg", "https://example.com/image.png"], + MediaTypes: ["audio/ogg", "image/png"], + MediaUnderstanding: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "Hello world", + provider: "whisper", + }, + ], + }); + // Audio attachment should be stripped (already transcribed), image should remain + expect(note).toBe( + "[media attached: /tmp/image.png (image/png) | https://example.com/image.png]", + ); + }); + + it("only strips audio attachments that were transcribed", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"], + MediaUrls: ["https://example.com/voice-1.ogg", "https://example.com/voice-2.ogg"], + MediaTypes: ["audio/ogg", "audio/ogg"], + MediaUnderstanding: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "First transcript", + provider: "whisper", + }, + ], + }); + expect(note).toBe( + "[media attached: /tmp/voice-2.ogg (audio/ogg) | https://example.com/voice-2.ogg]", + ); + }); + + it("strips audio attachments when Transcript is present (issue #4197)", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/voice.opus"], + MediaTypes: ["audio/opus"], + Transcript: "Hello world from Whisper", + }); + // Audio should be stripped when transcript is available + expect(note).toBeUndefined(); + }); + + it("does not strip multiple audio attachments using transcript-only fallback", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"], + MediaTypes: ["audio/ogg", "audio/ogg"], + Transcript: "Transcript text without per-attachment mapping", + }); + expect(note).toBe( + [ + "[media attached: 2 files]", + "[media attached 1/2: /tmp/voice-1.ogg (audio/ogg)]", + "[media attached 2/2: /tmp/voice-2.ogg (audio/ogg)]", + ].join("\n"), + ); + }); + + it("strips audio by extension even without mime type (issue #4197)", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/voice_message.ogg", "/tmp/document.pdf"], + MediaUnderstanding: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "Transcribed audio content", + provider: "whisper", + }, + ], + }); + // Only PDF should remain, audio stripped by extension + expect(note).toBe("[media attached: /tmp/document.pdf]"); + }); + + it("keeps audio attachments when no transcription available", () => { + const note = buildInboundMediaNote({ + MediaPaths: ["/tmp/voice.ogg"], + MediaTypes: ["audio/ogg"], + }); + // No transcription = keep audio attachment as fallback + expect(note).toBe("[media attached: /tmp/voice.ogg (audio/ogg)]"); + }); }); diff --git a/src/auto-reply/media-note.ts b/src/auto-reply/media-note.ts index a34139fee06..7835988f56e 100644 --- a/src/auto-reply/media-note.ts +++ b/src/auto-reply/media-note.ts @@ -17,12 +17,45 @@ function formatMediaAttachedLine(params: { return `${prefix}${params.path}${typePart}${urlPart}]`; } +// Common audio file extensions for transcription detection +const AUDIO_EXTENSIONS = new Set([ + ".ogg", + ".opus", + ".mp3", + ".m4a", + ".wav", + ".webm", + ".flac", + ".aac", + ".wma", + ".aiff", + ".alac", + ".oga", +]); + +function isAudioPath(path: string | undefined): boolean { + if (!path) { + return false; + } + const lower = path.toLowerCase(); + for (const ext of AUDIO_EXTENSIONS) { + if (lower.endsWith(ext)) { + return true; + } + } + return false; +} + export function buildInboundMediaNote(ctx: MsgContext): string | undefined { // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel. const suppressed = new Set(); + const transcribedAudioIndices = new Set(); if (Array.isArray(ctx.MediaUnderstanding)) { for (const output of ctx.MediaUnderstanding) { suppressed.add(output.attachmentIndex); + if (output.kind === "audio.transcription") { + transcribedAudioIndices.add(output.attachmentIndex); + } } } if (Array.isArray(ctx.MediaUnderstandingDecisions)) { @@ -33,6 +66,9 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined { for (const attachment of decision.attachments) { if (attachment.chosen?.outcome === "success") { suppressed.add(attachment.attachmentIndex); + if (decision.capability === "audio") { + transcribedAudioIndices.add(attachment.attachmentIndex); + } } } } @@ -56,6 +92,10 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined { Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length ? ctx.MediaTypes : undefined; + const hasTranscript = Boolean(ctx.Transcript?.trim()); + // Transcript alone does not identify an attachment index; only use it as a fallback + // when there is a single attachment to avoid stripping unrelated audio files. + const canStripSingleAttachmentByTranscript = hasTranscript && paths.length === 1; const entries = paths .map((entry, index) => ({ @@ -64,7 +104,28 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined { url: urls?.[index] ?? ctx.MediaUrl, index, })) - .filter((entry) => !suppressed.has(entry.index)); + .filter((entry) => { + if (suppressed.has(entry.index)) { + return false; + } + // Strip audio attachments when transcription succeeded - the transcript is already + // available in the context, raw audio binary would only waste tokens (issue #4197) + // Note: Only trust MIME type from per-entry types array, not fallback ctx.MediaType + // which could misclassify non-audio attachments (greptile review feedback) + const hasPerEntryType = types !== undefined; + const isAudioByMime = hasPerEntryType && entry.type?.toLowerCase().startsWith("audio/"); + const isAudioEntry = isAudioPath(entry.path) || isAudioByMime; + if (!isAudioEntry) { + return true; + } + if ( + transcribedAudioIndices.has(entry.index) || + (canStripSingleAttachmentByTranscript && entry.index === 0) + ) { + return false; + } + return true; + }); if (entries.length === 0) { return undefined; }