feat(discord): add voice message support

Adds support for sending Discord voice messages via the message tool with the asVoice: true parameter.

Voice messages require:
- OGG/Opus format (auto-converted if needed via ffmpeg)
- Waveform data (generated from audio samples)
- Duration in seconds
- Message flag 8192 (IS_VOICE_MESSAGE)

Implementation:
- New voice-message.ts with audio processing utilities
- getAudioDuration() using ffprobe
- generateWaveform() samples audio and creates base64 waveform
- ensureOggOpus() converts audio to required format
- sendDiscordVoiceMessage() handles 3-step Discord upload process

Usage: message(action='send', channel='discord', target='...', path='/path/to/audio.mp3', asVoice=true)

Note: Voice messages cannot include text content (Discord limitation)
2026-05-08 22:38:26 +00:00 · 2026-02-02 17:00:19 +01:00
parent aec3221391
commit a09e4fac3f
5 changed files with 444 additions and 1 deletions
--- a/src/discord/voice-message.ts
+++ b/src/discord/voice-message.ts
@@ -0,0 +1,325 @@
+/**
+ * Discord Voice Message Support
+ *
+ * Implements sending voice messages via Discord's API.
+ * Voice messages require:
+ * - OGG/Opus format audio
+ * - Waveform data (base64 encoded, up to 256 samples, 0-255 values)
+ * - Duration in seconds
+ * - Message flag 8192 (IS_VOICE_MESSAGE)
+ * - No other content (text, embeds, etc.)
+ */
+
+import type { RequestClient } from "@buape/carbon";
+import { execFile } from "node:child_process";
+import { randomUUID } from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { promisify } from "node:util";
+import type { RetryRunner } from "../infra/retry-policy.js";
+
+const execFileAsync = promisify(execFile); // Promise-returning execFile; used below to run ffprobe and ffmpeg
+
+const DISCORD_VOICE_MESSAGE_FLAG = 8192; // IS_VOICE_MESSAGE; sent as `flags` on the outgoing message
+const WAVEFORM_SAMPLES = 256; // number of amplitude bytes in every generated waveform
+
+export type VoiceMessageMetadata = { // audio-derived values attached to a voice message
+  durationSecs: number; // audio length in seconds; sent to Discord as `duration_secs`
+  waveform: string; // base64-encoded amplitude bytes (0-255 each); sent to Discord as `waveform`
+};
+
+/**
+ * Probe an audio file's length with ffprobe.
+ *
+ * @param filePath - Path of the audio file to inspect.
+ * @returns Duration in seconds, rounded to two decimal places.
+ * @throws Error whose message starts with "Failed to get audio duration:"
+ *   when ffprobe cannot be run, exits with an error, or prints something
+ *   that does not parse as a number.
+ */
+export async function getAudioDuration(filePath: string): Promise<number> {
+  // Ask ffprobe for nothing but the container-level duration, as a bare CSV value.
+  const probeArgs = ["-v", "error", "-show_entries", "format=duration", "-of", "csv=p=0", filePath];
+
+  let seconds: number;
+  try {
+    const probe = await execFileAsync("ffprobe", probeArgs);
+    seconds = parseFloat(probe.stdout.trim());
+    if (Number.isNaN(seconds)) {
+      throw new Error("Could not parse duration");
+    }
+  } catch (err) {
+    // Both spawn failures and the parse failure above are reported with one prefix.
+    const reason = err instanceof Error ? err.message : err;
+    throw new Error(`Failed to get audio duration: ${reason}`);
+  }
+
+  // Keep two decimal places.
+  return Math.round(seconds * 100) / 100;
+}
+
+/**
+ * Build the waveform preview that accompanies a voice message.
+ *
+ * The audio is decoded to PCM and reduced to WAVEFORM_SAMPLES amplitude bytes
+ * by generateWaveformFromPcm(). Waveform generation is best-effort: if that
+ * step fails for any reason (ffmpeg missing, undecodable input, I/O error),
+ * a synthetic placeholder waveform is returned instead of throwing, so a
+ * cosmetic preview never blocks sending the message.
+ *
+ * @param filePath - Path of the audio file to analyse.
+ * @returns Base64-encoded byte array of amplitude samples (0-255).
+ */
+export async function generateWaveform(filePath: string): Promise<string> {
+  try {
+    // A single decode pass is enough. `return await` (not a bare `return`)
+    // keeps a rejection inside this try so the fallback below can handle it.
+    return await generateWaveformFromPcm(filePath);
+  } catch {
+    // Deliberate fallback: the waveform is only a visual preview.
+    return generatePlaceholderWaveform();
+  }
+}
+
+/**
+ * Generate a waveform by decoding the audio to raw PCM and averaging amplitudes.
+ *
+ * ffmpeg converts the input to 16-bit signed little-endian mono PCM at 8 kHz.
+ * The samples are split into WAVEFORM_SAMPLES consecutive segments of equal
+ * length; each output byte is the segment's mean absolute amplitude scaled to
+ * 0-255. Inputs shorter than WAVEFORM_SAMPLES samples are zero-padded.
+ *
+ * @param filePath - Path of the audio file to decode.
+ * @returns Base64 encoding of exactly WAVEFORM_SAMPLES amplitude bytes.
+ * @throws If the temp directory cannot be created, ffmpeg fails, or the
+ *   decoded file cannot be read.
+ */
+async function generateWaveformFromPcm(filePath: string): Promise<string> {
+  // A private mkdtemp directory gives every call its own path, so concurrent
+  // calls cannot overwrite each other's PCM file and the location is not
+  // predictable inside the shared temp directory.
+  const workDir = await fs.mkdtemp(path.join(os.tmpdir(), "waveform-"));
+  const tempPcm = path.join(workDir, "audio.raw");
+
+  try {
+    // Convert to raw 16-bit signed PCM, mono, 8kHz
+    await execFileAsync("ffmpeg", [
+      "-y",
+      "-i",
+      filePath,
+      "-f",
+      "s16le",
+      "-acodec",
+      "pcm_s16le",
+      "-ac",
+      "1",
+      "-ar",
+      "8000",
+      tempPcm,
+    ]);
+
+    const pcmData = await fs.readFile(tempPcm);
+    // Samples are read with readInt16LE rather than through an Int16Array view:
+    // the data is little-endian by construction, and this does not depend on
+    // host byte order or on the Buffer's byteOffset being 2-byte aligned.
+    const sampleCount = Math.floor(pcmData.byteLength / 2);
+    const step = Math.max(1, Math.floor(sampleCount / WAVEFORM_SAMPLES));
+
+    // Zero-initialised, so segments with no samples are already padded with 0.
+    const waveform = new Uint8Array(WAVEFORM_SAMPLES);
+
+    for (let i = 0; i < WAVEFORM_SAMPLES; i++) {
+      const start = i * step;
+      if (start >= sampleCount) {
+        break;
+      }
+      const end = Math.min(start + step, sampleCount);
+
+      // Mean absolute amplitude of this segment.
+      let sum = 0;
+      for (let j = start; j < end; j++) {
+        sum += Math.abs(pcmData.readInt16LE(j * 2));
+      }
+      const avg = sum / (end - start);
+
+      // Normalize to 0-255 (16-bit signed max is 32767); the clamp covers -32768.
+      waveform[i] = Math.min(255, Math.round((avg / 32767) * 255));
+    }
+
+    return Buffer.from(waveform).toString("base64");
+  } finally {
+    // Best-effort cleanup: a failure to remove temp data must not mask the
+    // result or the original error.
+    await fs.rm(workDir, { recursive: true, force: true }).catch(() => {});
+  }
+}
+
+/**
+ * Synthetic stand-in waveform for when the audio cannot be analysed.
+ *
+ * @returns Base64 encoding of WAVEFORM_SAMPLES bytes tracing a sine pattern
+ *   (four full periods) centred on 128 with an amplitude of 64.
+ */
+function generatePlaceholderWaveform(): string {
+  const bytes = Array.from({ length: WAVEFORM_SAMPLES }, (_, index) => {
+    const phase = (index / WAVEFORM_SAMPLES) * Math.PI * 8;
+    const level = Math.round(128 + 64 * Math.sin(phase));
+    // Clamp into the single-byte range.
+    return Math.max(0, Math.min(255, level));
+  });
+  return Buffer.from(bytes).toString("base64");
+}
+
+/**
+ * Make sure an audio file is in OGG/Opus format, converting it if necessary.
+ *
+ * A file with an `.ogg` extension whose first audio stream is already Opus is
+ * returned untouched. Anything else (other extensions, OGG/Vorbis, or a file
+ * that ffprobe cannot inspect) is transcoded with ffmpeg into a new file in
+ * the OS temp directory.
+ *
+ * @param filePath - Path of the source audio file.
+ * @returns `path` of the OGG/Opus file, and `cleanup: true` when that file is
+ *   a temporary conversion result the caller should delete after use
+ *   (`cleanup: false` means `path` is the caller's own input file).
+ * @throws The ffmpeg error if the conversion fails; any partially written
+ *   output file is removed first.
+ */
+export async function ensureOggOpus(filePath: string): Promise<{ path: string; cleanup: boolean }> {
+  const ext = path.extname(filePath).toLowerCase();
+
+  // Check if already OGG
+  if (ext === ".ogg") {
+    // Verify it's Opus codec, not Vorbis (Vorbis won't play on mobile)
+    try {
+      const { stdout } = await execFileAsync("ffprobe", [
+        "-v",
+        "error",
+        "-select_streams",
+        "a:0",
+        "-show_entries",
+        "stream=codec_name",
+        "-of",
+        "csv=p=0",
+        filePath,
+      ]);
+      if (stdout.trim().toLowerCase() === "opus") {
+        return { path: filePath, cleanup: false };
+      }
+    } catch {
+      // If probe fails, convert anyway
+    }
+  }
+
+  // Convert to OGG/Opus. The random component keeps concurrent conversions
+  // from writing to the same path and makes the name unpredictable inside the
+  // shared temp directory (a timestamp alone is neither unique nor secret).
+  const outputPath = path.join(os.tmpdir(), `voice-${Date.now()}-${randomUUID()}.ogg`);
+
+  try {
+    await execFileAsync("ffmpeg", [
+      "-y",
+      "-i",
+      filePath,
+      // Audio only: drop video / cover-art streams so the result is a plain
+      // Opus-in-OGG file.
+      "-vn",
+      "-c:a",
+      "libopus",
+      "-b:a",
+      "64k",
+      outputPath,
+    ]);
+  } catch (err) {
+    // Do not leave a partial output file behind; the caller never learns its
+    // path when the conversion throws.
+    await fs.rm(outputPath, { force: true }).catch(() => {});
+    throw err;
+  }
+
+  return { path: outputPath, cleanup: true };
+}
+
+/**
+ * Collect the metadata a voice message needs for one audio file.
+ *
+ * Duration probing and waveform generation are started together and awaited
+ * jointly.
+ *
+ * @param filePath - Path of the audio file to analyse.
+ * @returns The file's duration in seconds and its base64 waveform.
+ * @throws Whatever getAudioDuration() or generateWaveform() rejects with.
+ */
+export async function getVoiceMessageMetadata(filePath: string): Promise<VoiceMessageMetadata> {
+  // Kick off both tasks before awaiting either so they run concurrently.
+  const pendingDuration = getAudioDuration(filePath);
+  const pendingWaveform = generateWaveform(filePath);
+
+  const [durationSecs, waveform] = await Promise.all([pendingDuration, pendingWaveform]);
+  return { durationSecs, waveform };
+}
+
+type UploadUrlResponse = { // response shape expected from POST /channels/{channelId}/attachments
+  attachments: Array<{
+    id: number; // not read by sendDiscordVoiceMessage
+    upload_url: string; // URL the audio bytes are PUT to
+    upload_filename: string; // sent back as `uploaded_filename` when posting the message
+  }>;
+};
+
+/**
+ * Post an audio buffer to a channel as a Discord voice message.
+ *
+ * Three requests are made, in order:
+ * 1. POST `/channels/{channelId}/attachments` to obtain an upload URL.
+ * 2. PUT the audio bytes to that URL.
+ * 3. POST `/channels/{channelId}/messages` with flag 8192 and the attachment
+ *    metadata (uploaded filename, duration, waveform).
+ *
+ * @param rest - Client used for the two Discord REST calls.
+ * @param channelId - Target channel id.
+ * @param audioBuffer - Audio bytes; uploaded with content type `audio/ogg`.
+ * @param metadata - Duration and waveform sent with the attachment.
+ * @param replyTo - Id of a message to reply to; when falsy no
+ *   `message_reference` is sent.
+ * @param request - Runner each REST call is passed through, together with a
+ *   label ("voice-upload-url" / "voice-message").
+ * @returns The response of the final message POST.
+ * @throws Error if no upload slot is returned or the upload responds with a
+ *   non-OK status; errors from `request` / `fetch` propagate unchanged.
+ */
+export async function sendDiscordVoiceMessage(
+  rest: RequestClient,
+  channelId: string,
+  audioBuffer: Buffer,
+  metadata: VoiceMessageMetadata,
+  replyTo: string | undefined,
+  request: RetryRunner,
+): Promise<{ id: string; channel_id: string }> {
+  type SentMessage = { id: string; channel_id: string };
+  type VoiceMessagePayload = {
+    flags: number;
+    attachments: Array<{
+      id: string;
+      filename: string;
+      uploaded_filename: string;
+      duration_secs: number;
+      waveform: string;
+    }>;
+    message_reference?: { message_id: string; fail_if_not_exists: boolean };
+  };
+
+  const filename = "voice-message.ogg";
+  const fileSize = audioBuffer.byteLength;
+
+  // Step 1: ask Discord where to upload the file.
+  const requestUploadSlot = () =>
+    rest.post(`/channels/${channelId}/attachments`, {
+      body: {
+        files: [{ filename, file_size: fileSize, id: "0" }],
+      },
+    }) as Promise<UploadUrlResponse>;
+  const slots = (await request(requestUploadSlot, "voice-upload-url")) as UploadUrlResponse;
+
+  const slot = slots.attachments?.[0];
+  if (!slot) {
+    throw new Error("Failed to get upload URL for voice message");
+  }
+
+  // Step 2: push the audio bytes to the URL Discord handed back.
+  const uploadResponse = await fetch(slot.upload_url, {
+    method: "PUT",
+    headers: { "Content-Type": "audio/ogg" },
+    body: new Uint8Array(audioBuffer),
+  });
+  if (!uploadResponse.ok) {
+    throw new Error(`Failed to upload voice message: ${uploadResponse.status}`);
+  }
+
+  // Step 3: create the message. Voice messages carry no text content, but a
+  // reply reference is allowed and is appended only when replyTo is set.
+  const messagePayload: VoiceMessagePayload = {
+    flags: DISCORD_VOICE_MESSAGE_FLAG,
+    attachments: [
+      {
+        id: "0",
+        filename,
+        uploaded_filename: slot.upload_filename,
+        duration_secs: metadata.durationSecs,
+        waveform: metadata.waveform,
+      },
+    ],
+    ...(replyTo ? { message_reference: { message_id: replyTo, fail_if_not_exists: false } } : {}),
+  };
+
+  const postMessage = () =>
+    rest.post(`/channels/${channelId}/messages`, {
+      body: messagePayload,
+    }) as Promise<SentMessage>;
+
+  return (await request(postMessage, "voice-message")) as SentMessage;
+}