mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 21:14:31 +00:00
fix(media): recognize MP3 and M4A as voice-compatible audio (#15438)
* fix(media): recognize MP3 and M4A as voice-compatible audio

  Telegram sendVoice supports OGG/Opus, MP3, and M4A, but isVoiceCompatibleAudio only recognized OGG/Opus formats.

  - Add MP3 and M4A extensions and MIME types
  - Use explicit MIME set instead of substring matching
  - Handle MIME parameters (e.g. 'audio/ogg; codecs=opus')
  - Add test coverage for all supported and unsupported formats

* fix: narrow MIME allowlist per review feedback

  Remove audio/mp4 and audio/aac from voice MIME types — too broad. Keep only M4A-specific types (audio/x-m4a, audio/m4a). Add audio/mp4 and audio/aac as negative test cases.

* fix: align voice compatibility and channel coverage (#15438) (thanks @azade-c)

---------

Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -71,6 +71,7 @@ Docs: https://docs.openclaw.ai
|
|||||||
- Exec/Allowlist: allow multiline heredoc bodies (`<<`, `<<-`) while keeping multiline non-heredoc shell commands blocked, so exec approval parsing permits heredoc input safely without allowing general newline command chaining. (#13811) Thanks @mcaxtr.
|
- Exec/Allowlist: allow multiline heredoc bodies (`<<`, `<<-`) while keeping multiline non-heredoc shell commands blocked, so exec approval parsing permits heredoc input safely without allowing general newline command chaining. (#13811) Thanks @mcaxtr.
|
||||||
- Docs/Mermaid: remove hardcoded Mermaid init theme blocks from four docs diagrams so dark mode inherits readable theme defaults. (#15157) Thanks @heytulsiprasad.
|
- Docs/Mermaid: remove hardcoded Mermaid init theme blocks from four docs diagrams so dark mode inherits readable theme defaults. (#15157) Thanks @heytulsiprasad.
|
||||||
- Outbound/Threading: pass `replyTo` and `threadId` from `message send` tool actions through the core outbound send path to channel adapters, preserving thread/reply routing. (#14948) Thanks @mcaxtr.
|
- Outbound/Threading: pass `replyTo` and `threadId` from `message send` tool actions through the core outbound send path to channel adapters, preserving thread/reply routing. (#14948) Thanks @mcaxtr.
|
||||||
|
- Telegram/Matrix: treat MP3 and M4A (including `audio/mp4`) as voice-compatible for `asVoice` routing, and keep WAV/AAC falling back to regular audio sends. (#15438) Thanks @azade-c.
|
||||||
- Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew.
|
- Sessions/Agents: pass `agentId` when resolving existing transcript paths in reply runs so non-default agents and heartbeat/chat handlers no longer fail with `Session file path must be within sessions directory`. (#15141) Thanks @Goldenmonstew.
|
||||||
- Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman.
|
- Sessions/Agents: pass `agentId` through status and usage transcript-resolution paths (auto-reply, gateway usage APIs, and session cost/log loaders) so non-default agents can resolve absolute session files without path-validation failures. (#15103) Thanks @jalehman.
|
||||||
- Sessions: archive previous transcript files on `/new` and `/reset` session resets (including gateway `sessions.reset`) so stale transcripts do not accumulate on disk. (#14869) Thanks @mcaxtr.
|
- Sessions: archive previous transcript files on `/new` and `/reset` session resets (including gateway `sessions.reset`) so stale transcripts do not accumulate on disk. (#14869) Thanks @mcaxtr.
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ const loadWebMediaMock = vi.fn().mockResolvedValue({
|
|||||||
contentType: "image/png",
|
contentType: "image/png",
|
||||||
kind: "image",
|
kind: "image",
|
||||||
});
|
});
|
||||||
|
const mediaKindFromMimeMock = vi.fn(() => "image");
|
||||||
|
const isVoiceCompatibleAudioMock = vi.fn(() => false);
|
||||||
const getImageMetadataMock = vi.fn().mockResolvedValue(null);
|
const getImageMetadataMock = vi.fn().mockResolvedValue(null);
|
||||||
const resizeToJpegMock = vi.fn();
|
const resizeToJpegMock = vi.fn();
|
||||||
|
|
||||||
@@ -33,8 +35,8 @@ const runtimeStub = {
|
|||||||
},
|
},
|
||||||
media: {
|
media: {
|
||||||
loadWebMedia: (...args: unknown[]) => loadWebMediaMock(...args),
|
loadWebMedia: (...args: unknown[]) => loadWebMediaMock(...args),
|
||||||
mediaKindFromMime: () => "image",
|
mediaKindFromMime: (...args: unknown[]) => mediaKindFromMimeMock(...args),
|
||||||
isVoiceCompatibleAudio: () => false,
|
isVoiceCompatibleAudio: (...args: unknown[]) => isVoiceCompatibleAudioMock(...args),
|
||||||
getImageMetadata: (...args: unknown[]) => getImageMetadataMock(...args),
|
getImageMetadata: (...args: unknown[]) => getImageMetadataMock(...args),
|
||||||
resizeToJpeg: (...args: unknown[]) => resizeToJpegMock(...args),
|
resizeToJpeg: (...args: unknown[]) => resizeToJpegMock(...args),
|
||||||
},
|
},
|
||||||
@@ -71,6 +73,8 @@ describe("sendMessageMatrix media", () => {
|
|||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
vi.clearAllMocks();
|
vi.clearAllMocks();
|
||||||
|
mediaKindFromMimeMock.mockReturnValue("image");
|
||||||
|
isVoiceCompatibleAudioMock.mockReturnValue(false);
|
||||||
setMatrixRuntime(runtimeStub);
|
setMatrixRuntime(runtimeStub);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -133,6 +137,66 @@ describe("sendMessageMatrix media", () => {
|
|||||||
expect(content.url).toBeUndefined();
|
expect(content.url).toBeUndefined();
|
||||||
expect(content.file?.url).toBe("mxc://example/file");
|
expect(content.file?.url).toBe("mxc://example/file");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("marks voice metadata and sends caption follow-up when audioAsVoice is compatible", async () => {
|
||||||
|
const { client, sendMessage } = makeClient();
|
||||||
|
mediaKindFromMimeMock.mockReturnValue("audio");
|
||||||
|
isVoiceCompatibleAudioMock.mockReturnValue(true);
|
||||||
|
loadWebMediaMock.mockResolvedValueOnce({
|
||||||
|
buffer: Buffer.from("audio"),
|
||||||
|
fileName: "clip.mp3",
|
||||||
|
contentType: "audio/mpeg",
|
||||||
|
kind: "audio",
|
||||||
|
});
|
||||||
|
|
||||||
|
await sendMessageMatrix("room:!room:example", "voice caption", {
|
||||||
|
client,
|
||||||
|
mediaUrl: "file:///tmp/clip.mp3",
|
||||||
|
audioAsVoice: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(isVoiceCompatibleAudioMock).toHaveBeenCalledWith({
|
||||||
|
contentType: "audio/mpeg",
|
||||||
|
fileName: "clip.mp3",
|
||||||
|
});
|
||||||
|
expect(sendMessage).toHaveBeenCalledTimes(2);
|
||||||
|
const mediaContent = sendMessage.mock.calls[0]?.[1] as {
|
||||||
|
msgtype?: string;
|
||||||
|
body?: string;
|
||||||
|
"org.matrix.msc3245.voice"?: Record<string, never>;
|
||||||
|
};
|
||||||
|
expect(mediaContent.msgtype).toBe("m.audio");
|
||||||
|
expect(mediaContent.body).toBe("Voice message");
|
||||||
|
expect(mediaContent["org.matrix.msc3245.voice"]).toEqual({});
|
||||||
|
});
|
||||||
|
|
||||||
|
it("keeps regular audio payload when audioAsVoice media is incompatible", async () => {
|
||||||
|
const { client, sendMessage } = makeClient();
|
||||||
|
mediaKindFromMimeMock.mockReturnValue("audio");
|
||||||
|
isVoiceCompatibleAudioMock.mockReturnValue(false);
|
||||||
|
loadWebMediaMock.mockResolvedValueOnce({
|
||||||
|
buffer: Buffer.from("audio"),
|
||||||
|
fileName: "clip.wav",
|
||||||
|
contentType: "audio/wav",
|
||||||
|
kind: "audio",
|
||||||
|
});
|
||||||
|
|
||||||
|
await sendMessageMatrix("room:!room:example", "voice caption", {
|
||||||
|
client,
|
||||||
|
mediaUrl: "file:///tmp/clip.wav",
|
||||||
|
audioAsVoice: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(sendMessage).toHaveBeenCalledTimes(1);
|
||||||
|
const mediaContent = sendMessage.mock.calls[0]?.[1] as {
|
||||||
|
msgtype?: string;
|
||||||
|
body?: string;
|
||||||
|
"org.matrix.msc3245.voice"?: Record<string, never>;
|
||||||
|
};
|
||||||
|
expect(mediaContent.msgtype).toBe("m.audio");
|
||||||
|
expect(mediaContent.body).toBe("voice caption");
|
||||||
|
expect(mediaContent["org.matrix.msc3245.voice"]).toBeUndefined();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("sendMessageMatrix threads", () => {
|
describe("sendMessageMatrix threads", () => {
|
||||||
|
|||||||
43
src/media/audio.test.ts
Normal file
43
src/media/audio.test.ts
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import { describe, expect, it } from "vitest";
|
||||||
|
import { isVoiceCompatibleAudio } from "./audio.js";
|
||||||
|
|
||||||
|
describe("isVoiceCompatibleAudio", () => {
|
||||||
|
it.each([
|
||||||
|
{ contentType: "audio/ogg", fileName: null },
|
||||||
|
{ contentType: "audio/opus", fileName: null },
|
||||||
|
{ contentType: "audio/ogg; codecs=opus", fileName: null },
|
||||||
|
{ contentType: "audio/mpeg", fileName: null },
|
||||||
|
{ contentType: "audio/mp3", fileName: null },
|
||||||
|
{ contentType: "audio/mp4", fileName: null },
|
||||||
|
{ contentType: "audio/mp4; codecs=mp4a.40.2", fileName: null },
|
||||||
|
{ contentType: "audio/x-m4a", fileName: null },
|
||||||
|
{ contentType: "audio/m4a", fileName: null },
|
||||||
|
])("returns true for MIME type $contentType", (opts) => {
|
||||||
|
expect(isVoiceCompatibleAudio(opts)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.each([".ogg", ".oga", ".opus", ".mp3", ".m4a"])("returns true for extension %s", (ext) => {
|
||||||
|
expect(isVoiceCompatibleAudio({ fileName: `voice${ext}` })).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.each([
|
||||||
|
{ contentType: "audio/wav", fileName: null },
|
||||||
|
{ contentType: "audio/flac", fileName: null },
|
||||||
|
{ contentType: "audio/aac", fileName: null },
|
||||||
|
{ contentType: "video/mp4", fileName: null },
|
||||||
|
])("returns false for unsupported MIME $contentType", (opts) => {
|
||||||
|
expect(isVoiceCompatibleAudio(opts)).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it.each([".wav", ".flac", ".webm"])("returns false for extension %s", (ext) => {
|
||||||
|
expect(isVoiceCompatibleAudio({ fileName: `audio${ext}` })).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("returns false when no contentType and no fileName", () => {
|
||||||
|
expect(isVoiceCompatibleAudio({})).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("prefers MIME type over extension", () => {
|
||||||
|
expect(isVoiceCompatibleAudio({ contentType: "audio/mpeg", fileName: "file.wav" })).toBe(true);
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -1,14 +1,32 @@
|
|||||||
import { getFileExtension } from "./mime.js";
|
import { getFileExtension } from "./mime.js";
|
||||||
|
|
||||||
const VOICE_AUDIO_EXTENSIONS = new Set([".oga", ".ogg", ".opus"]);
|
const VOICE_AUDIO_EXTENSIONS = new Set([".oga", ".ogg", ".opus", ".mp3", ".m4a"]);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MIME types compatible with voice messages.
|
||||||
|
* Telegram sendVoice supports OGG/Opus, MP3, and M4A.
|
||||||
|
* https://core.telegram.org/bots/api#sendvoice
|
||||||
|
*/
|
||||||
|
const VOICE_MIME_TYPES = new Set([
|
||||||
|
"audio/ogg",
|
||||||
|
"audio/opus",
|
||||||
|
"audio/mpeg",
|
||||||
|
"audio/mp3",
|
||||||
|
"audio/mp4",
|
||||||
|
"audio/x-m4a",
|
||||||
|
"audio/m4a",
|
||||||
|
]);
|
||||||
|
|
||||||
export function isVoiceCompatibleAudio(opts: {
|
export function isVoiceCompatibleAudio(opts: {
|
||||||
contentType?: string | null;
|
contentType?: string | null;
|
||||||
fileName?: string | null;
|
fileName?: string | null;
|
||||||
}): boolean {
|
}): boolean {
|
||||||
const mime = opts.contentType?.toLowerCase();
|
const mime = opts.contentType?.toLowerCase().trim();
|
||||||
if (mime && (mime.includes("ogg") || mime.includes("opus"))) {
|
if (mime) {
|
||||||
return true;
|
const baseMime = mime.split(";")[0].trim();
|
||||||
|
if (VOICE_MIME_TYPES.has(baseMime)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
const fileName = opts.fileName?.trim();
|
const fileName = opts.fileName?.trim();
|
||||||
if (!fileName) {
|
if (!fileName) {
|
||||||
|
|||||||
@@ -436,6 +436,41 @@ describe("sendMessageTelegram", () => {
|
|||||||
sendVoice: typeof sendVoice;
|
sendVoice: typeof sendVoice;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
loadWebMedia.mockResolvedValueOnce({
|
||||||
|
buffer: Buffer.from("audio"),
|
||||||
|
contentType: "audio/wav",
|
||||||
|
fileName: "clip.wav",
|
||||||
|
});
|
||||||
|
|
||||||
|
await sendMessageTelegram(chatId, "caption", {
|
||||||
|
token: "tok",
|
||||||
|
api,
|
||||||
|
mediaUrl: "https://example.com/clip.wav",
|
||||||
|
asVoice: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), {
|
||||||
|
caption: "caption",
|
||||||
|
parse_mode: "HTML",
|
||||||
|
});
|
||||||
|
expect(sendVoice).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("sends MP3 as voice when asVoice is true", async () => {
|
||||||
|
const chatId = "123";
|
||||||
|
const sendAudio = vi.fn().mockResolvedValue({
|
||||||
|
message_id: 16,
|
||||||
|
chat: { id: chatId },
|
||||||
|
});
|
||||||
|
const sendVoice = vi.fn().mockResolvedValue({
|
||||||
|
message_id: 17,
|
||||||
|
chat: { id: chatId },
|
||||||
|
});
|
||||||
|
const api = { sendAudio, sendVoice } as unknown as {
|
||||||
|
sendAudio: typeof sendAudio;
|
||||||
|
sendVoice: typeof sendVoice;
|
||||||
|
};
|
||||||
|
|
||||||
loadWebMedia.mockResolvedValueOnce({
|
loadWebMedia.mockResolvedValueOnce({
|
||||||
buffer: Buffer.from("audio"),
|
buffer: Buffer.from("audio"),
|
||||||
contentType: "audio/mpeg",
|
contentType: "audio/mpeg",
|
||||||
@@ -449,11 +484,11 @@ describe("sendMessageTelegram", () => {
|
|||||||
asVoice: true,
|
asVoice: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), {
|
expect(sendVoice).toHaveBeenCalledWith(chatId, expect.anything(), {
|
||||||
caption: "caption",
|
caption: "caption",
|
||||||
parse_mode: "HTML",
|
parse_mode: "HTML",
|
||||||
});
|
});
|
||||||
expect(sendVoice).not.toHaveBeenCalled();
|
expect(sendAudio).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
it("includes message_thread_id for forum topic messages", async () => {
|
it("includes message_thread_id for forum topic messages", async () => {
|
||||||
|
|||||||
@@ -18,13 +18,13 @@ describe("resolveTelegramVoiceSend", () => {
|
|||||||
const logFallback = vi.fn();
|
const logFallback = vi.fn();
|
||||||
const result = resolveTelegramVoiceSend({
|
const result = resolveTelegramVoiceSend({
|
||||||
wantsVoice: true,
|
wantsVoice: true,
|
||||||
contentType: "audio/mpeg",
|
contentType: "audio/wav",
|
||||||
fileName: "track.mp3",
|
fileName: "track.wav",
|
||||||
logFallback,
|
logFallback,
|
||||||
});
|
});
|
||||||
expect(result.useVoice).toBe(false);
|
expect(result.useVoice).toBe(false);
|
||||||
expect(logFallback).toHaveBeenCalledWith(
|
expect(logFallback).toHaveBeenCalledWith(
|
||||||
"Telegram voice requested but media is audio/mpeg (track.mp3); sending as audio file instead.",
|
"Telegram voice requested but media is audio/wav (track.wav); sending as audio file instead.",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -39,4 +39,19 @@ describe("resolveTelegramVoiceSend", () => {
|
|||||||
expect(result.useVoice).toBe(true);
|
expect(result.useVoice).toBe(true);
|
||||||
expect(logFallback).not.toHaveBeenCalled();
|
expect(logFallback).not.toHaveBeenCalled();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it.each([
|
||||||
|
{ contentType: "audio/mpeg", fileName: "track.mp3" },
|
||||||
|
{ contentType: "audio/mp4", fileName: "track.m4a" },
|
||||||
|
])("keeps voice for compatible MIME $contentType", ({ contentType, fileName }) => {
|
||||||
|
const logFallback = vi.fn();
|
||||||
|
const result = resolveTelegramVoiceSend({
|
||||||
|
wantsVoice: true,
|
||||||
|
contentType,
|
||||||
|
fileName,
|
||||||
|
logFallback,
|
||||||
|
});
|
||||||
|
expect(result.useVoice).toBe(true);
|
||||||
|
expect(logFallback).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user