Add runtime.stt.transcribeAudioFile for plugin STT access

Expose audio transcription through the PluginRuntime so external plugins (e.g. marmot) can use openclaw's media-understanding provider framework without importing unexported internal modules. The new transcribeAudioFile() wraps runCapability({capability: "audio"}) and reads provider/model/apiKey from tools.media.audio in the config, matching the pattern used by the Discord VC implementation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-08 10:51:23 +00:00 · 2026-02-20 21:52:08 -06:00
parent f7b0378ccb
commit faa4ffec03
4 changed files with 61 additions and 0 deletions
--- a/src/media-understanding/transcribe-audio.ts
+++ b/src/media-understanding/transcribe-audio.ts
@@ -0,0 +1,51 @@
+import type { OpenClawConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+/**
+ * Runtime-exposed STT entry point: runs the "audio" capability on one file so
+ * plugins need not import internal media-understanding modules.
+ *
+ * `mime` defaults to "audio/wav"; capability settings are taken from
+ * `cfg.tools.media.audio`. The resolved `text` is `undefined` when no
+ * attachment is produced, no "audio.transcription" output is found, or the
+ * transcript is empty after trimming.
+ */
+export async function transcribeAudioFile(params: {
+  filePath: string;
+  cfg: OpenClawConfig;
+  agentDir?: string;
+  mime?: string;
+}): Promise<{ text: string | undefined }> {
+  const { filePath, cfg, agentDir, mime } = params;
+  const ctx = { MediaPath: filePath, MediaType: mime ?? "audio/wav" };
+  const media = normalizeMediaAttachments(ctx);
+  // Nothing to transcribe when normalization yields no attachments.
+  if (media.length === 0) {
+    return { text: undefined };
+  }
+  const attachmentCache = createMediaAttachmentCache(media);
+  const registry = buildProviderRegistry();
+  try {
+    const { outputs } = await runCapability({
+      capability: "audio",
+      cfg,
+      ctx,
+      attachments: attachmentCache,
+      media,
+      agentDir,
+      providerRegistry: registry,
+      config: cfg.tools?.media?.audio,
+    });
+    const transcript = outputs.find((entry) => entry.kind === "audio.transcription");
+    const trimmed = transcript?.text?.trim();
+    return { text: trimmed ? trimmed : undefined };
+  } finally {
+    // Release the attachment cache even when the capability run throws.
+    await attachmentCache.cleanup();
+  }
+}