feat(audio): auto-echo transcription to chat before agent processing

When echoTranscript is enabled in tools.media.audio config, the
transcription text is sent back to the originating chat immediately
after successful audio transcription — before the agent processes it.
This lets users verify what was heard from their voice note.

Changes:
- config/types.tools.ts: add echoTranscript (bool) and echoFormat
  (string template) to MediaUnderstandingConfig
- media-understanding/apply.ts: sendTranscriptEcho() helper that
  resolves channel/to from ctx, guards on isDeliverableMessageChannel,
  and calls deliverOutboundPayloads best-effort
- config/schema.help.ts: help text for both new fields
- config/schema.labels.ts: labels for both new fields
- media-understanding/apply.echo-transcript.test.ts: 10 vitest cases
  covering disabled/enabled/custom-format/no-audio/failed-transcription/
  non-deliverable-channel/missing-from/OriginatingTo/delivery-failure

Default echoFormat: '📝 "{transcript}"'

Closes #32102
This commit is contained in:
AytuncYildizli
2026-03-02 23:31:57 +03:00
committed by Peter Steinberger
parent ef89b48785
commit 1b61269eec
5 changed files with 442 additions and 0 deletions

View File

@@ -0,0 +1,353 @@
import fs from "node:fs/promises";
import path from "node:path";
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
// ---------------------------------------------------------------------------
// Module mocks
// ---------------------------------------------------------------------------
// Mock the model-auth module so no real credentials are resolved:
// every resolver returns a static "test-key" in api-key mode.
vi.mock("../agents/model-auth.js", () => ({
resolveApiKeyForProvider: vi.fn(async () => ({
apiKey: "test-key",
source: "test",
mode: "api-key",
})),
// Mirrors the real requireApiKey contract: return the key when present,
// otherwise throw with the provider and auth mode in the message.
requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
if (auth?.apiKey) {
return auth.apiKey;
}
throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`);
},
resolveAwsSdkEnvVarName: vi.fn(() => undefined),
resolveEnvApiKey: vi.fn(() => null),
resolveModelAuthMode: vi.fn(() => "api-key"),
getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })),
getCustomProviderApiKey: vi.fn(() => undefined),
ensureAuthProfileStore: vi.fn(async () => ({})),
resolveAuthProfileOrder: vi.fn(() => []),
}));
/**
 * Stand-in for the real MediaFetchError from ../media/fetch.js.
 * Mirrors its shape: an Error carrying a machine-readable `code`,
 * with `name` fixed to "MediaFetchError" so name-based checks pass.
 */
class MediaFetchErrorMock extends Error {
  code: string;

  constructor(message: string, code: string) {
    super(message);
    this.code = code;
    this.name = "MediaFetchError";
  }
}
// Mock remote-media fetching; MediaFetchErrorMock keeps name-based error
// checks working without pulling in the real module.
vi.mock("../media/fetch.js", () => ({
fetchRemoteMedia: vi.fn(),
MediaFetchError: MediaFetchErrorMock,
}));
// Mock process execution so no external binaries are spawned during tests.
vi.mock("../process/exec.js", () => ({
runExec: vi.fn(),
runCommandWithTimeout: vi.fn(),
}));
// Captures outbound deliveries. Although vi.mock calls are hoisted, the
// factory only runs when ./apply.js is dynamically imported (in beforeAll),
// after this const has been initialized — so the closure is safe.
const mockDeliverOutboundPayloads = vi.fn();
vi.mock("../infra/outbound/deliver.js", () => ({
deliverOutboundPayloads: (...args: unknown[]) => mockDeliverOutboundPayloads(...args),
}));
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Lazily-bound exports from ./apply.js and ./runner.js; assigned in beforeAll
// via dynamic import so the vi.mock factories above take effect before the
// modules under test are evaluated.
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
let clearMediaUnderstandingBinaryCacheForTests: () => void;
// Prefix for this suite's temp root; the root is created in beforeAll and
// removed recursively in afterAll.
const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
let suiteTempMediaRootDir = "";
/**
 * Writes a small fake .ogg file into a fresh per-case temp directory under
 * the suite-level temp root and returns the file's absolute path.
 */
async function createTempAudioFile(): Promise<string> {
  const caseDir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
  const audioPath = path.join(caseDir, "note.ogg");
  const fakeOggBytes = Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]);
  await fs.writeFile(audioPath, fakeOggBytes);
  return audioPath;
}
/**
 * Builds a minimal WhatsApp audio-message context pointing at the given
 * media file; properties in `extra` override the defaults.
 */
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
  const base: MsgContext = {
    Body: "<media:audio>",
    MediaPath: mediaPath,
    MediaType: "audio/ogg",
    Provider: "whatsapp",
    From: "+10000000001",
    AccountId: "acc1",
  };
  return { ...base, ...extra };
}
/**
 * Builds an OpenClaw config with audio understanding enabled plus a fake
 * "groq" transcription provider.
 *
 * Options:
 * - echoTranscript: value for tools.media.audio.echoTranscript (default true)
 * - echoFormat: only set on the config when explicitly provided
 * - transcribedText: what the fake provider returns (default "hello world")
 */
function createAudioConfigWithEcho(opts?: {
  echoTranscript?: boolean;
  echoFormat?: string;
  transcribedText?: string;
}): {
  cfg: OpenClawConfig;
  providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
} {
  // Only materialize echoFormat on the config when the caller supplied one,
  // so "absent" and "explicitly set" remain distinguishable in tests.
  const echoFormatOverride = opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {};
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
          echoTranscript: opts?.echoTranscript ?? true,
          ...echoFormatOverride,
        },
      },
    },
  };
  const providers = {
    groq: {
      id: "groq",
      // Evaluated lazily at transcription time, matching the original closure.
      transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
    },
  };
  return { cfg, providers };
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
// Tests for the echo-transcript feature: when tools.media.audio.echoTranscript
// is enabled, a successful audio transcription is delivered back to the
// originating chat via deliverOutboundPayloads (mocked above).
describe("applyMediaUnderstanding echo transcript", () => {
// Create the suite temp root, then dynamically import the modules under test
// so the vi.mock factories apply to their dependency graph.
beforeAll(async () => {
const baseDir = resolvePreferredOpenClawTmpDir();
await fs.mkdir(baseDir, { recursive: true });
suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
const mod = await import("./apply.js");
applyMediaUnderstanding = mod.applyMediaUnderstanding;
const runner = await import("./runner.js");
clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
});
// Reset delivery capture and give it a default successful resolution;
// also clear the runner's binary cache so cases don't leak into each other.
beforeEach(() => {
mockDeliverOutboundPayloads.mockClear();
mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
// Optional-chained: may be unset if beforeAll's import failed.
clearMediaUnderstandingBinaryCacheForTests?.();
});
// Remove the suite temp root (best-effort via force) and reset the marker.
afterAll(async () => {
if (!suiteTempMediaRootDir) {
return;
}
await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
suiteTempMediaRootDir = "";
});
it("does NOT echo when echoTranscript is false (default)", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when echoTranscript is absent (default)", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
// Hand-rolled config (not the helper) so echoTranscript is truly absent.
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
// echoTranscript not set → defaults to false
},
},
},
};
const providers = {
groq: { id: "groq", transcribeAudio: async () => ({ text: "hello world" }) },
};
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
// Happy path: verifies channel/to/accountId resolution from ctx and the
// default '📝 "{transcript}"' formatting.
it("echoes transcript with default format when echoTranscript is true", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({
echoTranscript: true,
transcribedText: "hello world",
});
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
expect(callArgs).toBeDefined();
expect(callArgs.channel).toBe("whatsapp");
expect(callArgs.to).toBe("+10000000001");
expect(callArgs.accountId).toBe("acc1");
expect(callArgs.payloads).toHaveLength(1);
expect(callArgs.payloads[0].text).toBe('📝 "hello world"');
});
it("uses custom echoFormat when provided", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({
echoTranscript: true,
echoFormat: "🎙️ Heard: {transcript}",
transcribedText: "custom message",
});
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
expect(callArgs?.payloads[0].text).toBe("🎙️ Heard: custom message");
});
it("does NOT echo when there are no audio attachments", async () => {
// Image-only context — no audio attachment
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
const imgPath = path.join(dir, "photo.jpg");
// Minimal JPEG magic bytes; content is never decoded (image handling off).
await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));
const ctx: MsgContext = {
Body: "<media:image>",
MediaPath: imgPath,
MediaType: "image/jpeg",
Provider: "whatsapp",
From: "+10000000001",
};
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
echoTranscript: true,
},
image: { enabled: false },
},
},
};
const providers = {
groq: { id: "groq", transcribeAudio: async () => ({ text: "should not appear" }) },
};
await applyMediaUnderstanding({ ctx, cfg, providers });
// No audio outputs → Transcript not set → no echo
expect(ctx.Transcript).toBeUndefined();
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when transcription fails", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
echoTranscript: true,
},
},
},
};
const providers = {
groq: {
id: "groq",
transcribeAudio: async () => {
throw new Error("transcription provider failure");
},
},
};
// Should not throw; transcription failure is swallowed by runner
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(ctx.Transcript).toBeUndefined();
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when channel is not deliverable", async () => {
const mediaPath = await createTempAudioFile();
// Use an internal/non-deliverable channel
const ctx = createAudioCtxWithProvider(mediaPath, {
Provider: "internal-system",
From: "some-source",
});
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
await applyMediaUnderstanding({ ctx, cfg, providers });
// Transcript should be set (transcription succeeded)
expect(ctx.Transcript).toBe("hello world");
// But echo should be skipped
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when ctx has no From or OriginatingTo", async () => {
const mediaPath = await createTempAudioFile();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: mediaPath,
MediaType: "audio/ogg",
Provider: "whatsapp",
// From and OriginatingTo intentionally absent
};
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(ctx.Transcript).toBe("hello world");
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
// OriginatingTo takes precedence for the echo recipient when From is absent.
it("uses OriginatingTo when From is absent", async () => {
const mediaPath = await createTempAudioFile();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: mediaPath,
MediaType: "audio/ogg",
Provider: "whatsapp",
OriginatingTo: "+19999999999",
};
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
expect(callArgs?.to).toBe("+19999999999");
});
// Echo is best-effort: a rejected delivery must not surface to the caller.
it("echo delivery failure does not throw or break transcription", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));
// Should not throw
const result = await applyMediaUnderstanding({ ctx, cfg, providers });
// Transcription itself succeeded
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("hello world");
// Deliver was attempted
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
});
});

View File

@@ -8,6 +8,7 @@ import {
normalizeMimeType,
resolveInputFileLimits,
} from "../media/input-files.js";
import { isDeliverableMessageChannel } from "../utils/message-channel.js";
import { resolveAttachmentKind } from "./attachments.js";
import { runWithConcurrency } from "./concurrency.js";
import {
@@ -462,6 +463,68 @@ async function extractFileBlocks(params: {
return blocks;
}
const DEFAULT_ECHO_FORMAT = '📝 "{transcript}"';
/**
 * Formats a transcript echo message using the configured format string.
 *
 * Every `{transcript}` placeholder in `format` is replaced with the
 * transcript text. Uses split/join instead of `String.prototype.replace`
 * because with a string replacement, `replace`:
 * - substitutes only the first occurrence of the placeholder; and
 * - interprets special replacement patterns (`$&`, `$'`, `` $` ``, `$$`)
 *   in the replacement string, which would corrupt user transcripts that
 *   happen to contain dollar signs.
 */
function formatEchoTranscript(transcript: string, format: string): string {
  return format.split("{transcript}").join(transcript);
}
/**
 * Sends the transcript echo back to the originating chat.
 * Best-effort: logs on failure, never throws.
 *
 * Resolution rules:
 * - channel comes from ctx.Provider, falling back to ctx.Surface;
 * - recipient comes from ctx.OriginatingTo, falling back to ctx.From;
 * - skipped silently (verbose log only) when either is missing or the
 *   normalized channel is not deliverable.
 */
async function sendTranscriptEcho(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  transcript: string;
  format: string;
}): Promise<void> {
  const { ctx, cfg, transcript, format } = params;
  const rawChannel = ctx.Provider ?? ctx.Surface ?? "";
  const recipient = ctx.OriginatingTo ?? ctx.From ?? "";
  if (!rawChannel || !recipient) {
    if (shouldLogVerbose()) {
      logVerbose("media: echo-transcript skipped (no channel/to resolved from ctx)");
    }
    return;
  }
  const channel = rawChannel.trim().toLowerCase();
  if (!isDeliverableMessageChannel(channel)) {
    if (shouldLogVerbose()) {
      logVerbose(
        `media: echo-transcript skipped (channel "${String(channel)}" is not deliverable)`,
      );
    }
    return;
  }
  const text = formatEchoTranscript(transcript, format);
  try {
    // Imported lazily to avoid a static dependency cycle with the outbound
    // delivery module — NOTE(review): presumed reason; confirm against deps.
    const { deliverOutboundPayloads } = await import("../infra/outbound/deliver.js");
    await deliverOutboundPayloads({
      cfg,
      channel,
      to: recipient,
      accountId: ctx.AccountId ?? undefined,
      threadId: ctx.MessageThreadId ?? undefined,
      payloads: [{ text }],
      bestEffort: true,
    });
    if (shouldLogVerbose()) {
      logVerbose(`media: echo-transcript sent to ${channel}/${recipient}`);
    }
  } catch (err) {
    // Best-effort contract: delivery failures are logged and swallowed.
    logVerbose(`media: echo-transcript delivery failed: ${String(err)}`);
  }
}
export async function applyMediaUnderstanding(params: {
ctx: MsgContext;
cfg: OpenClawConfig;
@@ -528,6 +591,16 @@ export async function applyMediaUnderstanding(params: {
ctx.CommandBody = transcript;
ctx.RawBody = transcript;
}
// Echo transcript back to chat before agent processing, if configured.
const audioCfg = cfg.tools?.media?.audio;
if (audioCfg?.echoTranscript && transcript) {
await sendTranscriptEcho({
ctx,
cfg,
transcript,
format: audioCfg.echoFormat ?? DEFAULT_ECHO_FORMAT,
});
}
} else if (originalUserText) {
ctx.CommandBody = originalUserText;
ctx.RawBody = originalUserText;