Add runtime.stt.transcribeAudioFile for plugin STT access

Expose audio transcription through the PluginRuntime so external
plugins (e.g. marmot) can use openclaw's media-understanding provider
framework without importing unexported internal modules.

The new transcribeAudioFile() wraps runCapability({capability: "audio"})
and reads provider/model/apiKey from tools.media.audio in the config,
matching the pattern used by the Discord VC implementation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
benthecarman
2026-02-20 21:52:08 -06:00
committed by Peter Steinberger
parent f7b0378ccb
commit faa4ffec03
4 changed files with 61 additions and 0 deletions

View File

@@ -0,0 +1,51 @@
import type { OpenClawConfig } from "../config/config.js";
import {
buildProviderRegistry,
createMediaAttachmentCache,
normalizeMediaAttachments,
runCapability,
} from "./runner.js";
/**
 * Runtime-facing speech-to-text helper: transcribes a single audio file by
 * running the media-understanding `runCapability` pipeline with the `"audio"`
 * capability.
 *
 * Intended for external plugins (e.g. marmot) that need STT but should not
 * import the internal media-understanding modules themselves.
 *
 * The audio settings found at `tools.media.audio` in the supplied config are
 * forwarded to `runCapability` as `config`. Provider/model selection (and any
 * fallback between configured models) presumably happens inside
 * `runCapability`; that logic is not visible from this module.
 *
 * @param params.filePath - Path of the audio file to transcribe.
 * @param params.cfg - OpenClaw config; `cfg.tools?.media?.audio` is forwarded.
 * @param params.agentDir - Optional agent directory, passed through unchanged.
 * @param params.mime - MIME type of the file; `"audio/wav"` is used when this
 *   is null or undefined.
 * @returns `{ text }` with the trimmed transcription, or `{ text: undefined }`
 *   when no attachment could be derived from the input, when no
 *   `"audio.transcription"` output was produced, or when the transcript is
 *   empty after trimming.
 */
export async function transcribeAudioFile(params: {
  filePath: string;
  cfg: OpenClawConfig;
  agentDir?: string;
  mime?: string;
}): Promise<{ text: string | undefined }> {
  // Minimal message-like context describing the one file to process.
  const mediaContext = {
    MediaPath: params.filePath,
    MediaType: params.mime ?? "audio/wav",
  };

  const media = normalizeMediaAttachments(mediaContext);
  if (media.length === 0) {
    // Nothing to transcribe; skip creating the cache and registry entirely.
    return { text: undefined };
  }

  const attachmentCache = createMediaAttachmentCache(media);
  const registry = buildProviderRegistry();

  try {
    const { outputs } = await runCapability({
      capability: "audio",
      cfg: params.cfg,
      ctx: mediaContext,
      attachments: attachmentCache,
      media,
      agentDir: params.agentDir,
      providerRegistry: registry,
      config: params.cfg.tools?.media?.audio,
    });

    // Only the first transcription output is used.
    const transcription = outputs.find(({ kind }) => kind === "audio.transcription");
    const trimmed = transcription?.text?.trim();

    // Collapse a missing or whitespace-only transcript to `undefined`.
    return { text: trimmed ? trimmed : undefined };
  } finally {
    // Run cache cleanup whether the capability call succeeded or threw.
    await attachmentCache.cleanup();
  }
}