feat(audio): auto-echo transcription to chat before agent processing

When echoTranscript is enabled in tools.media.audio config, the
transcription text is sent back to the originating chat immediately
after successful audio transcription — before the agent processes it.
This lets users verify what was heard from their voice note.

Changes:
- config/types.tools.ts: add echoTranscript (bool) and echoFormat
  (string template) to MediaUnderstandingConfig
- media-understanding/apply.ts: sendTranscriptEcho() helper that
  resolves channel/to from ctx, guards on isDeliverableMessageChannel,
  and calls deliverOutboundPayloads best-effort
- config/schema.help.ts: help text for both new fields
- config/schema.labels.ts: labels for both new fields
- media-understanding/apply.echo-transcript.test.ts: 10 vitest cases
  covering disabled/enabled/custom-format/no-audio/failed-transcription/
  non-deliverable-channel/missing-from/OriginatingTo/delivery-failure

Default echoFormat: '📝 "{transcript}"'

Closes #32102
This commit is contained in:
AytuncYildizli
2026-03-02 23:31:57 +03:00
committed by Peter Steinberger
parent ef89b48785
commit 1b61269eec
5 changed files with 442 additions and 0 deletions

View File

@@ -0,0 +1,353 @@
import fs from "node:fs/promises";
import path from "node:path";
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import type { MsgContext } from "../auto-reply/templating.js";
import type { OpenClawConfig } from "../config/config.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
// ---------------------------------------------------------------------------
// Module mocks
// ---------------------------------------------------------------------------
// Mock the model-auth module so no real credentials are resolved:
// every resolver returns a static "test-key" in api-key mode.
vi.mock("../agents/model-auth.js", () => ({
resolveApiKeyForProvider: vi.fn(async () => ({
apiKey: "test-key",
source: "test",
mode: "api-key",
})),
// Mirrors the real requireApiKey contract: return the key when present,
// otherwise throw with the provider and auth mode in the message.
requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
if (auth?.apiKey) {
return auth.apiKey;
}
throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`);
},
resolveAwsSdkEnvVarName: vi.fn(() => undefined),
resolveEnvApiKey: vi.fn(() => null),
resolveModelAuthMode: vi.fn(() => "api-key"),
getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })),
getCustomProviderApiKey: vi.fn(() => undefined),
ensureAuthProfileStore: vi.fn(async () => ({})),
resolveAuthProfileOrder: vi.fn(() => []),
}));
/**
 * Stand-in for the real MediaFetchError from ../media/fetch.js.
 * Mirrors its shape: an Error carrying a machine-readable `code`,
 * with `name` fixed to "MediaFetchError" so name-based checks pass.
 */
class MediaFetchErrorMock extends Error {
  code: string;

  constructor(message: string, code: string) {
    super(message);
    this.code = code;
    this.name = "MediaFetchError";
  }
}
// Mock remote-media fetching; MediaFetchErrorMock keeps name-based error
// checks working without pulling in the real module.
vi.mock("../media/fetch.js", () => ({
fetchRemoteMedia: vi.fn(),
MediaFetchError: MediaFetchErrorMock,
}));
// Mock process execution so no external binaries are spawned during tests.
vi.mock("../process/exec.js", () => ({
runExec: vi.fn(),
runCommandWithTimeout: vi.fn(),
}));
// Captures outbound deliveries. Although vi.mock calls are hoisted, the
// factory only runs when ./apply.js is dynamically imported (in beforeAll),
// after this const has been initialized — so the closure is safe.
const mockDeliverOutboundPayloads = vi.fn();
vi.mock("../infra/outbound/deliver.js", () => ({
deliverOutboundPayloads: (...args: unknown[]) => mockDeliverOutboundPayloads(...args),
}));
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
// Lazily-bound exports from ./apply.js and ./runner.js; assigned in beforeAll
// via dynamic import so the vi.mock factories above take effect before the
// modules under test are evaluated.
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
let clearMediaUnderstandingBinaryCacheForTests: () => void;
// Prefix for this suite's temp root; the root is created in beforeAll and
// removed recursively in afterAll.
const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
let suiteTempMediaRootDir = "";
/**
 * Writes a small fake .ogg file into a fresh per-case temp directory under
 * the suite-level temp root and returns the file's absolute path.
 */
async function createTempAudioFile(): Promise<string> {
  const caseDir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
  const audioPath = path.join(caseDir, "note.ogg");
  const fakeOggBytes = Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]);
  await fs.writeFile(audioPath, fakeOggBytes);
  return audioPath;
}
/**
 * Builds a minimal WhatsApp audio-message context pointing at the given
 * media file; properties in `extra` override the defaults.
 */
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
  const base: MsgContext = {
    Body: "<media:audio>",
    MediaPath: mediaPath,
    MediaType: "audio/ogg",
    Provider: "whatsapp",
    From: "+10000000001",
    AccountId: "acc1",
  };
  return { ...base, ...extra };
}
/**
 * Builds an OpenClaw config with audio understanding enabled plus a fake
 * "groq" transcription provider.
 *
 * Options:
 * - echoTranscript: value for tools.media.audio.echoTranscript (default true)
 * - echoFormat: only set on the config when explicitly provided
 * - transcribedText: what the fake provider returns (default "hello world")
 */
function createAudioConfigWithEcho(opts?: {
  echoTranscript?: boolean;
  echoFormat?: string;
  transcribedText?: string;
}): {
  cfg: OpenClawConfig;
  providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
} {
  // Only materialize echoFormat on the config when the caller supplied one,
  // so "absent" and "explicitly set" remain distinguishable in tests.
  const echoFormatOverride = opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {};
  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
          echoTranscript: opts?.echoTranscript ?? true,
          ...echoFormatOverride,
        },
      },
    },
  };
  const providers = {
    groq: {
      id: "groq",
      // Evaluated lazily at transcription time, matching the original closure.
      transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
    },
  };
  return { cfg, providers };
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
// Tests for the echo-transcript feature: when tools.media.audio.echoTranscript
// is enabled, a successful audio transcription is delivered back to the
// originating chat via deliverOutboundPayloads (mocked above).
describe("applyMediaUnderstanding echo transcript", () => {
// Create the suite temp root, then dynamically import the modules under test
// so the vi.mock factories apply to their dependency graph.
beforeAll(async () => {
const baseDir = resolvePreferredOpenClawTmpDir();
await fs.mkdir(baseDir, { recursive: true });
suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
const mod = await import("./apply.js");
applyMediaUnderstanding = mod.applyMediaUnderstanding;
const runner = await import("./runner.js");
clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
});
// Reset delivery capture and give it a default successful resolution;
// also clear the runner's binary cache so cases don't leak into each other.
beforeEach(() => {
mockDeliverOutboundPayloads.mockClear();
mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
// Optional-chained: may be unset if beforeAll's import failed.
clearMediaUnderstandingBinaryCacheForTests?.();
});
// Remove the suite temp root (best-effort via force) and reset the marker.
afterAll(async () => {
if (!suiteTempMediaRootDir) {
return;
}
await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
suiteTempMediaRootDir = "";
});
it("does NOT echo when echoTranscript is false (default)", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when echoTranscript is absent (default)", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
// Hand-rolled config (not the helper) so echoTranscript is truly absent.
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
// echoTranscript not set → defaults to false
},
},
},
};
const providers = {
groq: { id: "groq", transcribeAudio: async () => ({ text: "hello world" }) },
};
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
// Happy path: verifies channel/to/accountId resolution from ctx and the
// default '📝 "{transcript}"' formatting.
it("echoes transcript with default format when echoTranscript is true", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({
echoTranscript: true,
transcribedText: "hello world",
});
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
expect(callArgs).toBeDefined();
expect(callArgs.channel).toBe("whatsapp");
expect(callArgs.to).toBe("+10000000001");
expect(callArgs.accountId).toBe("acc1");
expect(callArgs.payloads).toHaveLength(1);
expect(callArgs.payloads[0].text).toBe('📝 "hello world"');
});
it("uses custom echoFormat when provided", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({
echoTranscript: true,
echoFormat: "🎙️ Heard: {transcript}",
transcribedText: "custom message",
});
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
expect(callArgs?.payloads[0].text).toBe("🎙️ Heard: custom message");
});
it("does NOT echo when there are no audio attachments", async () => {
// Image-only context — no audio attachment
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
const imgPath = path.join(dir, "photo.jpg");
// Minimal JPEG magic bytes; content is never decoded (image handling off).
await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));
const ctx: MsgContext = {
Body: "<media:image>",
MediaPath: imgPath,
MediaType: "image/jpeg",
Provider: "whatsapp",
From: "+10000000001",
};
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
echoTranscript: true,
},
image: { enabled: false },
},
},
};
const providers = {
groq: { id: "groq", transcribeAudio: async () => ({ text: "should not appear" }) },
};
await applyMediaUnderstanding({ ctx, cfg, providers });
// No audio outputs → Transcript not set → no echo
expect(ctx.Transcript).toBeUndefined();
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when transcription fails", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const cfg: OpenClawConfig = {
tools: {
media: {
audio: {
enabled: true,
maxBytes: 1024 * 1024,
models: [{ provider: "groq" }],
echoTranscript: true,
},
},
},
};
const providers = {
groq: {
id: "groq",
transcribeAudio: async () => {
throw new Error("transcription provider failure");
},
},
};
// Should not throw; transcription failure is swallowed by runner
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(ctx.Transcript).toBeUndefined();
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when channel is not deliverable", async () => {
const mediaPath = await createTempAudioFile();
// Use an internal/non-deliverable channel
const ctx = createAudioCtxWithProvider(mediaPath, {
Provider: "internal-system",
From: "some-source",
});
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
await applyMediaUnderstanding({ ctx, cfg, providers });
// Transcript should be set (transcription succeeded)
expect(ctx.Transcript).toBe("hello world");
// But echo should be skipped
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
it("does NOT echo when ctx has no From or OriginatingTo", async () => {
const mediaPath = await createTempAudioFile();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: mediaPath,
MediaType: "audio/ogg",
Provider: "whatsapp",
// From and OriginatingTo intentionally absent
};
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(ctx.Transcript).toBe("hello world");
expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
});
// OriginatingTo takes precedence for the echo recipient when From is absent.
it("uses OriginatingTo when From is absent", async () => {
const mediaPath = await createTempAudioFile();
const ctx: MsgContext = {
Body: "<media:audio>",
MediaPath: mediaPath,
MediaType: "audio/ogg",
Provider: "whatsapp",
OriginatingTo: "+19999999999",
};
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
await applyMediaUnderstanding({ ctx, cfg, providers });
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
expect(callArgs?.to).toBe("+19999999999");
});
// Echo is best-effort: a rejected delivery must not surface to the caller.
it("echo delivery failure does not throw or break transcription", async () => {
const mediaPath = await createTempAudioFile();
const ctx = createAudioCtxWithProvider(mediaPath);
const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));
// Should not throw
const result = await applyMediaUnderstanding({ ctx, cfg, providers });
// Transcription itself succeeded
expect(result.appliedAudio).toBe(true);
expect(ctx.Transcript).toBe("hello world");
// Deliver was attempted
expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
});
});

View File

@@ -8,6 +8,7 @@ import {
normalizeMimeType,
resolveInputFileLimits,
} from "../media/input-files.js";
import { isDeliverableMessageChannel } from "../utils/message-channel.js";
import { resolveAttachmentKind } from "./attachments.js";
import { runWithConcurrency } from "./concurrency.js";
import {
@@ -462,6 +463,68 @@ async function extractFileBlocks(params: {
return blocks;
}
const DEFAULT_ECHO_FORMAT = '📝 "{transcript}"';
/**
 * Formats a transcript echo message using the configured format string.
 *
 * Every `{transcript}` placeholder in `format` is replaced with the
 * transcript text. Uses split/join instead of `String.prototype.replace`
 * because with a string replacement, `replace`:
 * - substitutes only the first occurrence of the placeholder; and
 * - interprets special replacement patterns (`$&`, `$'`, `` $` ``, `$$`)
 *   in the replacement string, which would corrupt user transcripts that
 *   happen to contain dollar signs.
 */
function formatEchoTranscript(transcript: string, format: string): string {
  return format.split("{transcript}").join(transcript);
}
/**
 * Sends the transcript echo back to the originating chat.
 * Best-effort: logs on failure, never throws.
 *
 * Resolution rules:
 * - channel comes from ctx.Provider, falling back to ctx.Surface;
 * - recipient comes from ctx.OriginatingTo, falling back to ctx.From;
 * - skipped silently (verbose log only) when either is missing or the
 *   normalized channel is not deliverable.
 */
async function sendTranscriptEcho(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  transcript: string;
  format: string;
}): Promise<void> {
  const { ctx, cfg, transcript, format } = params;
  const rawChannel = ctx.Provider ?? ctx.Surface ?? "";
  const recipient = ctx.OriginatingTo ?? ctx.From ?? "";
  if (!rawChannel || !recipient) {
    if (shouldLogVerbose()) {
      logVerbose("media: echo-transcript skipped (no channel/to resolved from ctx)");
    }
    return;
  }
  const channel = rawChannel.trim().toLowerCase();
  if (!isDeliverableMessageChannel(channel)) {
    if (shouldLogVerbose()) {
      logVerbose(
        `media: echo-transcript skipped (channel "${String(channel)}" is not deliverable)`,
      );
    }
    return;
  }
  const text = formatEchoTranscript(transcript, format);
  try {
    // Imported lazily to avoid a static dependency cycle with the outbound
    // delivery module — NOTE(review): presumed reason; confirm against deps.
    const { deliverOutboundPayloads } = await import("../infra/outbound/deliver.js");
    await deliverOutboundPayloads({
      cfg,
      channel,
      to: recipient,
      accountId: ctx.AccountId ?? undefined,
      threadId: ctx.MessageThreadId ?? undefined,
      payloads: [{ text }],
      bestEffort: true,
    });
    if (shouldLogVerbose()) {
      logVerbose(`media: echo-transcript sent to ${channel}/${recipient}`);
    }
  } catch (err) {
    // Best-effort contract: delivery failures are logged and swallowed.
    logVerbose(`media: echo-transcript delivery failed: ${String(err)}`);
  }
}
export async function applyMediaUnderstanding(params: {
ctx: MsgContext;
cfg: OpenClawConfig;
@@ -528,6 +591,16 @@ export async function applyMediaUnderstanding(params: {
ctx.CommandBody = transcript;
ctx.RawBody = transcript;
}
// Echo transcript back to chat before agent processing, if configured.
const audioCfg = cfg.tools?.media?.audio;
if (audioCfg?.echoTranscript && transcript) {
await sendTranscriptEcho({
ctx,
cfg,
transcript,
format: audioCfg.echoFormat ?? DEFAULT_ECHO_FORMAT,
});
}
} else if (originalUserText) {
ctx.CommandBody = originalUserText;
ctx.RawBody = originalUserText;