mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-08 20:28:29 +00:00
feat(audio): auto-echo transcription to chat before agent processing
When echoTranscript is enabled in tools.media.audio config, the transcription text is sent back to the originating chat immediately after successful audio transcription — before the agent processes it. This lets users verify what was heard from their voice note. Changes: - config/types.tools.ts: add echoTranscript (bool) and echoFormat (string template) to MediaUnderstandingConfig - media-understanding/apply.ts: sendTranscriptEcho() helper that resolves channel/to from ctx, guards on isDeliverableMessageChannel, and calls deliverOutboundPayloads best-effort - config/schema.help.ts: help text for both new fields - config/schema.labels.ts: labels for both new fields - media-understanding/apply.echo-transcript.test.ts: 10 vitest cases covering disabled/enabled/custom-format/no-audio/failed-transcription/ non-deliverable-channel/missing-from/OriginatingTo/delivery-failure Default echoFormat: '📝 "{transcript}"' Closes #32102
This commit is contained in:
committed by
Peter Steinberger
parent
ef89b48785
commit
1b61269eec
353
src/media-understanding/apply.echo-transcript.test.ts
Normal file
353
src/media-understanding/apply.echo-transcript.test.ts
Normal file
@@ -0,0 +1,353 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import type { OpenClawConfig } from "../config/config.js";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Module mocks
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Stub out the whole model-auth module so transcription providers resolve a
// fixed "test-key" and never touch real credential stores or env vars.
vi.mock("../agents/model-auth.js", () => ({
  resolveApiKeyForProvider: vi.fn(async () => ({
    apiKey: "test-key",
    source: "test",
    mode: "api-key",
  })),
  // Mirrors the real requireApiKey contract: return the key when present,
  // otherwise throw with provider/mode context.
  requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
    if (auth?.apiKey) {
      return auth.apiKey;
    }
    throw new Error(`No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`);
  },
  resolveAwsSdkEnvVarName: vi.fn(() => undefined),
  resolveEnvApiKey: vi.fn(() => null),
  resolveModelAuthMode: vi.fn(() => "api-key"),
  getApiKeyForModel: vi.fn(async () => ({ apiKey: "test-key", source: "test", mode: "api-key" })),
  getCustomProviderApiKey: vi.fn(() => undefined),
  ensureAuthProfileStore: vi.fn(async () => ({})),
  resolveAuthProfileOrder: vi.fn(() => []),
}));
|
||||
|
||||
class MediaFetchErrorMock extends Error {
|
||||
code: string;
|
||||
constructor(message: string, code: string) {
|
||||
super(message);
|
||||
this.name = "MediaFetchError";
|
||||
this.code = code;
|
||||
}
|
||||
}
|
||||
|
||||
// Remote-media fetching is never exercised here; the mock class above stands
// in for MediaFetchError so `instanceof` checks keep working.
vi.mock("../media/fetch.js", () => ({
  fetchRemoteMedia: vi.fn(),
  MediaFetchError: MediaFetchErrorMock,
}));

// No real subprocesses (ffmpeg etc.) during tests.
vi.mock("../process/exec.js", () => ({
  runExec: vi.fn(),
  runCommandWithTimeout: vi.fn(),
}));

// Captured spy for outbound delivery — the tests assert on its calls.
const mockDeliverOutboundPayloads = vi.fn();

// The mock factory forwards through a stable wrapper so the spy can be
// reconfigured per-test (mockClear/mockResolvedValue) without re-mocking.
vi.mock("../infra/outbound/deliver.js", () => ({
  deliverOutboundPayloads: (...args: unknown[]) => mockDeliverOutboundPayloads(...args),
}));
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Bound in beforeAll via dynamic import so the vi.mock factories above are
// registered before ./apply.js (and its transitive imports) are evaluated.
let applyMediaUnderstanding: typeof import("./apply.js").applyMediaUnderstanding;
let clearMediaUnderstandingBinaryCacheForTests: () => void;
|
||||
const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
|
||||
let suiteTempMediaRootDir = "";
|
||||
|
||||
async function createTempAudioFile(): Promise<string> {
|
||||
const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "case-"));
|
||||
const filePath = path.join(dir, "note.ogg");
|
||||
await fs.writeFile(filePath, Buffer.from([0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8]));
|
||||
return filePath;
|
||||
}
|
||||
|
||||
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
|
||||
return {
|
||||
Body: "<media:audio>",
|
||||
MediaPath: mediaPath,
|
||||
MediaType: "audio/ogg",
|
||||
Provider: "whatsapp",
|
||||
From: "+10000000001",
|
||||
AccountId: "acc1",
|
||||
...extra,
|
||||
};
|
||||
}
|
||||
|
||||
function createAudioConfigWithEcho(opts?: {
|
||||
echoTranscript?: boolean;
|
||||
echoFormat?: string;
|
||||
transcribedText?: string;
|
||||
}): {
|
||||
cfg: OpenClawConfig;
|
||||
providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
|
||||
} {
|
||||
const cfg: OpenClawConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
maxBytes: 1024 * 1024,
|
||||
models: [{ provider: "groq" }],
|
||||
echoTranscript: opts?.echoTranscript ?? true,
|
||||
...(opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {}),
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
const providers = {
|
||||
groq: {
|
||||
id: "groq",
|
||||
transcribeAudio: async () => ({ text: opts?.transcribedText ?? "hello world" }),
|
||||
},
|
||||
};
|
||||
return { cfg, providers };
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// End-to-end coverage of the echo-transcript feature: the transcript is sent
// back to the originating chat (via deliverOutboundPayloads) only when
// echoTranscript is enabled, transcription succeeded, and the channel/recipient
// can be resolved. Delivery failure must never break transcription itself.
describe("applyMediaUnderstanding – echo transcript", () => {
  beforeAll(async () => {
    const baseDir = resolvePreferredOpenClawTmpDir();
    await fs.mkdir(baseDir, { recursive: true });
    suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
    // Import lazily so the vi.mock factories above are in effect first.
    const mod = await import("./apply.js");
    applyMediaUnderstanding = mod.applyMediaUnderstanding;
    const runner = await import("./runner.js");
    clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
  });

  beforeEach(() => {
    // Reset the delivery spy and give it a fresh resolved value per test.
    mockDeliverOutboundPayloads.mockClear();
    mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
    clearMediaUnderstandingBinaryCacheForTests?.();
  });

  afterAll(async () => {
    if (!suiteTempMediaRootDir) {
      return;
    }
    await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
    suiteTempMediaRootDir = "";
  });

  it("does NOT echo when echoTranscript is false (default)", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when echoTranscript is absent (default)", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    // Config built inline (not via the helper) so echoTranscript is truly absent.
    const cfg: OpenClawConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            models: [{ provider: "groq" }],
            // echoTranscript not set → defaults to false
          },
        },
      },
    };
    const providers = {
      groq: { id: "groq", transcribeAudio: async () => ({ text: "hello world" }) },
    };

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("echoes transcript with default format when echoTranscript is true", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      transcribedText: "hello world",
    });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
    const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
    expect(callArgs).toBeDefined();
    // Channel/recipient/account must be resolved from the inbound ctx.
    expect(callArgs.channel).toBe("whatsapp");
    expect(callArgs.to).toBe("+10000000001");
    expect(callArgs.accountId).toBe("acc1");
    expect(callArgs.payloads).toHaveLength(1);
    expect(callArgs.payloads[0].text).toBe('📝 "hello world"');
  });

  it("uses custom echoFormat when provided", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      echoFormat: "🎙️ Heard: {transcript}",
      transcribedText: "custom message",
    });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
    const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
    expect(callArgs?.payloads[0].text).toBe("🎙️ Heard: custom message");
  });

  it("does NOT echo when there are no audio attachments", async () => {
    // Image-only context — no audio attachment
    const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
    const imgPath = path.join(dir, "photo.jpg");
    await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));

    const ctx: MsgContext = {
      Body: "<media:image>",
      MediaPath: imgPath,
      MediaType: "image/jpeg",
      Provider: "whatsapp",
      From: "+10000000001",
    };

    const cfg: OpenClawConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            models: [{ provider: "groq" }],
            echoTranscript: true,
          },
          image: { enabled: false },
        },
      },
    };
    const providers = {
      groq: { id: "groq", transcribeAudio: async () => ({ text: "should not appear" }) },
    };

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // No audio outputs → Transcript not set → no echo
    expect(ctx.Transcript).toBeUndefined();
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when transcription fails", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const cfg: OpenClawConfig = {
      tools: {
        media: {
          audio: {
            enabled: true,
            maxBytes: 1024 * 1024,
            models: [{ provider: "groq" }],
            echoTranscript: true,
          },
        },
      },
    };
    const providers = {
      groq: {
        id: "groq",
        transcribeAudio: async () => {
          throw new Error("transcription provider failure");
        },
      },
    };

    // Should not throw; transcription failure is swallowed by runner
    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(ctx.Transcript).toBeUndefined();
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when channel is not deliverable", async () => {
    const mediaPath = await createTempAudioFile();
    // Use an internal/non-deliverable channel
    const ctx = createAudioCtxWithProvider(mediaPath, {
      Provider: "internal-system",
      From: "some-source",
    });
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // Transcript should be set (transcription succeeded)
    expect(ctx.Transcript).toBe("hello world");
    // But echo should be skipped
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when ctx has no From or OriginatingTo", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: mediaPath,
      MediaType: "audio/ogg",
      Provider: "whatsapp",
      // From and OriginatingTo intentionally absent
    };
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(ctx.Transcript).toBe("hello world");
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("uses OriginatingTo when From is absent", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: mediaPath,
      MediaType: "audio/ogg",
      Provider: "whatsapp",
      OriginatingTo: "+19999999999",
    };
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
    const callArgs = mockDeliverOutboundPayloads.mock.calls[0]?.[0];
    expect(callArgs?.to).toBe("+19999999999");
  });

  it("echo delivery failure does not throw or break transcription", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));

    // Should not throw
    const result = await applyMediaUnderstanding({ ctx, cfg, providers });

    // Transcription itself succeeded
    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("hello world");
    // Deliver was attempted
    expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
  });
});
|
||||
@@ -8,6 +8,7 @@ import {
|
||||
normalizeMimeType,
|
||||
resolveInputFileLimits,
|
||||
} from "../media/input-files.js";
|
||||
import { isDeliverableMessageChannel } from "../utils/message-channel.js";
|
||||
import { resolveAttachmentKind } from "./attachments.js";
|
||||
import { runWithConcurrency } from "./concurrency.js";
|
||||
import {
|
||||
@@ -462,6 +463,68 @@ async function extractFileBlocks(params: {
|
||||
return blocks;
|
||||
}
|
||||
|
||||
const DEFAULT_ECHO_FORMAT = '📝 "{transcript}"';
|
||||
|
||||
/**
|
||||
* Formats a transcript echo message using the configured format string.
|
||||
* Replaces `{transcript}` placeholder with the actual transcript text.
|
||||
*/
|
||||
function formatEchoTranscript(transcript: string, format: string): string {
|
||||
return format.replace("{transcript}", transcript);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends the transcript echo back to the originating chat.
|
||||
* Best-effort: logs on failure, never throws.
|
||||
*/
|
||||
/**
 * Sends the transcript echo back to the originating chat.
 * Best-effort: logs on failure, never throws.
 *
 * Resolution rules (mirrored by the tests in apply.echo-transcript.test.ts):
 * - channel comes from ctx.Provider, falling back to ctx.Surface;
 * - recipient prefers ctx.OriginatingTo over ctx.From;
 * - skips silently (verbose log only) when either cannot be resolved or the
 *   channel is not deliverable.
 */
async function sendTranscriptEcho(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  transcript: string;
  // Echo template containing a `{transcript}` placeholder.
  format: string;
}): Promise<void> {
  const { ctx, cfg, transcript, format } = params;
  const channel = ctx.Provider ?? ctx.Surface ?? "";
  const to = ctx.OriginatingTo ?? ctx.From ?? "";

  if (!channel || !to) {
    if (shouldLogVerbose()) {
      logVerbose("media: echo-transcript skipped (no channel/to resolved from ctx)");
    }
    return;
  }

  const normalizedChannel = channel.trim().toLowerCase();
  if (!isDeliverableMessageChannel(normalizedChannel)) {
    if (shouldLogVerbose()) {
      logVerbose(
        `media: echo-transcript skipped (channel "${String(normalizedChannel)}" is not deliverable)`,
      );
    }
    return;
  }

  const text = formatEchoTranscript(transcript, format);

  try {
    // Dynamic import — presumably to avoid a module cycle with the outbound
    // delivery layer; TODO(review): confirm against the import graph.
    const { deliverOutboundPayloads } = await import("../infra/outbound/deliver.js");
    await deliverOutboundPayloads({
      cfg,
      channel: normalizedChannel,
      to,
      accountId: ctx.AccountId ?? undefined,
      threadId: ctx.MessageThreadId ?? undefined,
      payloads: [{ text }],
      bestEffort: true,
    });
    if (shouldLogVerbose()) {
      logVerbose(`media: echo-transcript sent to ${normalizedChannel}/${to}`);
    }
  } catch (err) {
    // Best-effort contract: delivery failures are logged and swallowed so
    // agent processing of the transcript continues unaffected.
    logVerbose(`media: echo-transcript delivery failed: ${String(err)}`);
  }
}
|
||||
|
||||
export async function applyMediaUnderstanding(params: {
|
||||
ctx: MsgContext;
|
||||
cfg: OpenClawConfig;
|
||||
@@ -528,6 +591,16 @@ export async function applyMediaUnderstanding(params: {
|
||||
ctx.CommandBody = transcript;
|
||||
ctx.RawBody = transcript;
|
||||
}
|
||||
// Echo transcript back to chat before agent processing, if configured.
|
||||
const audioCfg = cfg.tools?.media?.audio;
|
||||
if (audioCfg?.echoTranscript && transcript) {
|
||||
await sendTranscriptEcho({
|
||||
ctx,
|
||||
cfg,
|
||||
transcript,
|
||||
format: audioCfg.echoFormat ?? DEFAULT_ECHO_FORMAT,
|
||||
});
|
||||
}
|
||||
} else if (originalUserText) {
|
||||
ctx.CommandBody = originalUserText;
|
||||
ctx.RawBody = originalUserText;
|
||||
|
||||
Reference in New Issue
Block a user