From 7081dee1aff61e980bb8fc42c68ba3eb669a78cc Mon Sep 17 00:00:00 2001
From: Arkadiusz Mastalerz <aras88@gmail.com>
Date: Fri, 13 Feb 2026 02:01:53 +0100
Subject: [PATCH] fix(media): strip audio attachments after successful
 transcription (openclaw#9076) thanks @nobrainer-tech

Verified:
- pnpm install --frozen-lockfile
- pnpm build
- pnpm check
- pnpm test (fails in known unrelated telegram suite)
- pnpm vitest run src/auto-reply/media-note.test.ts src/auto-reply/reply.media-note.test.ts

Co-authored-by: nobrainer-tech <445466+nobrainer-tech@users.noreply.github.com>
---
 src/auto-reply/media-note.test.ts | 89 +++++++++++++++++++++++++++++++
 src/auto-reply/media-note.ts      | 63 +++++++++++++++++++++-
 2 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/src/auto-reply/media-note.test.ts b/src/auto-reply/media-note.test.ts
index 5d9ae04cbcf..3eb357bff89 100644
--- a/src/auto-reply/media-note.test.ts
+++ b/src/auto-reply/media-note.test.ts
@@ -106,4 +106,93 @@ describe("buildInboundMediaNote", () => {
     });
     expect(note).toBe("[media attached: /tmp/b.png | https://example.com/b.png]");
   });
+
+  it("strips audio attachments when transcription succeeded via MediaUnderstanding (issue #4197)", () => {
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/voice.ogg", "/tmp/image.png"],
+      MediaUrls: ["https://example.com/voice.ogg", "https://example.com/image.png"],
+      MediaTypes: ["audio/ogg", "image/png"],
+      MediaUnderstanding: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "Hello world",
+          provider: "whisper",
+        },
+      ],
+    });
+    // Audio attachment should be stripped (already transcribed), image should remain
+    expect(note).toBe(
+      "[media attached: /tmp/image.png (image/png) | https://example.com/image.png]",
+    );
+  });
+
+  it("only strips audio attachments that were transcribed", () => {
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"],
+      MediaUrls: ["https://example.com/voice-1.ogg", "https://example.com/voice-2.ogg"],
+      MediaTypes: ["audio/ogg", "audio/ogg"],
+      MediaUnderstanding: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "First transcript",
+          provider: "whisper",
+        },
+      ],
+    });
+    expect(note).toBe(
+      "[media attached: /tmp/voice-2.ogg (audio/ogg) | https://example.com/voice-2.ogg]",
+    );
+  });
+
+  it("strips audio attachments when Transcript is present (issue #4197)", () => {
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/voice.opus"],
+      MediaTypes: ["audio/opus"],
+      Transcript: "Hello world from Whisper",
+    });
+    // Audio should be stripped when transcript is available
+    expect(note).toBeUndefined();
+  });
+
+  it("does not strip multiple audio attachments using transcript-only fallback", () => {
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/voice-1.ogg", "/tmp/voice-2.ogg"],
+      MediaTypes: ["audio/ogg", "audio/ogg"],
+      Transcript: "Transcript text without per-attachment mapping",
+    });
+    expect(note).toBe(
+      [
+        "[media attached: 2 files]",
+        "[media attached 1/2: /tmp/voice-1.ogg (audio/ogg)]",
+        "[media attached 2/2: /tmp/voice-2.ogg (audio/ogg)]",
+      ].join("\n"),
+    );
+  });
+
+  it("strips audio by extension even without mime type (issue #4197)", () => {
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/voice_message.ogg", "/tmp/document.pdf"],
+      MediaUnderstanding: [
+        {
+          kind: "audio.transcription",
+          attachmentIndex: 0,
+          text: "Transcribed audio content",
+          provider: "whisper",
+        },
+      ],
+    });
+    // Only PDF should remain, audio stripped by extension
+    expect(note).toBe("[media attached: /tmp/document.pdf]");
+  });
+
+  it("keeps audio attachments when no transcription available", () => {
+    const note = buildInboundMediaNote({
+      MediaPaths: ["/tmp/voice.ogg"],
+      MediaTypes: ["audio/ogg"],
+    });
+    // No transcription = keep audio attachment as fallback
+    expect(note).toBe("[media attached: /tmp/voice.ogg (audio/ogg)]");
+  });
 });
diff --git a/src/auto-reply/media-note.ts b/src/auto-reply/media-note.ts
index a34139fee06..7835988f56e 100644
--- a/src/auto-reply/media-note.ts
+++ b/src/auto-reply/media-note.ts
@@ -17,12 +17,45 @@ function formatMediaAttachedLine(params: {
   return `${prefix}${params.path}${typePart}${urlPart}]`;
 }
 
+// Common audio file extensions (lowercase, leading dot) for transcription detection
+const AUDIO_EXTENSIONS = new Set([
+  ".ogg",
+  ".opus",
+  ".mp3",
+  ".m4a",
+  ".wav",
+  ".webm", // NOTE(review): .webm can also hold video; confirm it should count as audio
+  ".flac",
+  ".aac",
+  ".wma",
+  ".aiff",
+  ".alac",
+  ".oga",
+]);
+
+function isAudioPath(path: string | undefined): boolean { // case-insensitive suffix check
+  if (!path) {
+    return false; // undefined or empty path is never treated as audio
+  }
+  const lower = path.toLowerCase(); // AUDIO_EXTENSIONS entries are all lowercase
+  for (const ext of AUDIO_EXTENSIONS) {
+    if (lower.endsWith(ext)) { // suffix only: "a.ogg?x=1" does not match
+      return true;
+    }
+  }
+  return false;
+}
+
 export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
   // Attachment indices follow MediaPaths/MediaUrls ordering as supplied by the channel.
   const suppressed = new Set<number>();
+  const transcribedAudioIndices = new Set<number>();
   if (Array.isArray(ctx.MediaUnderstanding)) {
     for (const output of ctx.MediaUnderstanding) {
       suppressed.add(output.attachmentIndex);
+      if (output.kind === "audio.transcription") {
+        transcribedAudioIndices.add(output.attachmentIndex);
+      }
     }
   }
   if (Array.isArray(ctx.MediaUnderstandingDecisions)) {
@@ -33,6 +66,9 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
       for (const attachment of decision.attachments) {
         if (attachment.chosen?.outcome === "success") {
           suppressed.add(attachment.attachmentIndex);
+          if (decision.capability === "audio") {
+            transcribedAudioIndices.add(attachment.attachmentIndex);
+          }
         }
       }
     }
@@ -56,6 +92,10 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
     Array.isArray(ctx.MediaTypes) && ctx.MediaTypes.length === paths.length
       ? ctx.MediaTypes
       : undefined;
+  const hasTranscript = Boolean(ctx.Transcript?.trim());
+  // Transcript alone does not identify an attachment index; only use it as a fallback
+  // when there is a single attachment to avoid stripping unrelated audio files.
+  const canStripSingleAttachmentByTranscript = hasTranscript && paths.length === 1;
 
   const entries = paths
     .map((entry, index) => ({
@@ -64,7 +104,28 @@ export function buildInboundMediaNote(ctx: MsgContext): string | undefined {
       url: urls?.[index] ?? ctx.MediaUrl,
       index,
     }))
-    .filter((entry) => !suppressed.has(entry.index));
+    .filter((entry) => {
+      if (suppressed.has(entry.index)) {
+        return false;
+      }
+      // Strip audio attachments when transcription succeeded: the transcript is already
+      // available in the context, so the raw audio would only waste tokens (issue #4197).
+      // Note: only trust the MIME type from the per-entry types array, not the fallback
+      // ctx.MediaType, which could misclassify non-audio attachments (greptile review feedback).
+      const hasPerEntryType = types !== undefined;
+      const isAudioByMime = hasPerEntryType && entry.type?.toLowerCase().startsWith("audio/");
+      const isAudioEntry = isAudioPath(entry.path) || isAudioByMime;
+      if (!isAudioEntry) {
+        return true;
+      }
+      if (
+        transcribedAudioIndices.has(entry.index) ||
+        (canStripSingleAttachmentByTranscript && entry.index === 0)
+      ) {
+        return false;
+      }
+      return true;
+    });
   if (entries.length === 0) {
     return undefined;
   }