mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-13 10:50:35 +00:00
feat: restore voice-call plugin parity
This commit is contained in:
264
extensions/voice-call/src/providers/tts-openai.ts
Normal file
264
extensions/voice-call/src/providers/tts-openai.ts
Normal file
@@ -0,0 +1,264 @@
|
||||
/**
|
||||
* OpenAI TTS Provider
|
||||
*
|
||||
* Generates speech audio using OpenAI's text-to-speech API.
|
||||
* Handles audio format conversion for telephony (mu-law 8kHz).
|
||||
*
|
||||
* Best practices from OpenAI docs:
|
||||
* - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
|
||||
* - Use tts-1 for lower latency, tts-1-hd for higher quality
|
||||
* - Use marin or cedar voices for best quality
|
||||
* - Use pcm or wav format for fastest response times
|
||||
*
|
||||
* @see https://platform.openai.com/docs/guides/text-to-speech
|
||||
*/
|
||||
|
||||
/**
|
||||
* OpenAI TTS configuration.
|
||||
*/
|
||||
export interface OpenAITTSConfig {
|
||||
/** OpenAI API key (uses OPENAI_API_KEY env if not set) */
|
||||
apiKey?: string;
|
||||
/**
|
||||
* TTS model:
|
||||
* - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
|
||||
* - tts-1: lower latency
|
||||
* - tts-1-hd: higher quality
|
||||
*/
|
||||
model?: string;
|
||||
/**
|
||||
* Voice to use. For best quality, use marin or cedar.
|
||||
* All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
|
||||
* Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
|
||||
*/
|
||||
voice?: string;
|
||||
/** Speed multiplier (0.25 to 4.0) */
|
||||
speed?: number;
|
||||
/**
|
||||
* Instructions for speech style (only works with gpt-4o-mini-tts model).
|
||||
* Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
|
||||
*/
|
||||
instructions?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supported OpenAI TTS voices (all 13 built-in voices).
|
||||
* For best quality, use marin or cedar.
|
||||
* Note: tts-1 and tts-1-hd support a smaller set.
|
||||
*/
|
||||
export const OPENAI_TTS_VOICES = [
|
||||
"alloy",
|
||||
"ash",
|
||||
"ballad",
|
||||
"coral",
|
||||
"echo",
|
||||
"fable",
|
||||
"nova",
|
||||
"onyx",
|
||||
"sage",
|
||||
"shimmer",
|
||||
"verse",
|
||||
"marin",
|
||||
"cedar",
|
||||
] as const;
|
||||
|
||||
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
|
||||
/**
|
||||
* OpenAI TTS Provider for generating speech audio.
|
||||
*/
|
||||
export class OpenAITTSProvider {
|
||||
private apiKey: string;
|
||||
private model: string;
|
||||
private voice: OpenAITTSVoice;
|
||||
private speed: number;
|
||||
private instructions?: string;
|
||||
|
||||
constructor(config: OpenAITTSConfig = {}) {
|
||||
this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
|
||||
// Default to gpt-4o-mini-tts for intelligent realtime applications
|
||||
this.model = config.model || "gpt-4o-mini-tts";
|
||||
// Default to coral - good balance of quality and natural tone
|
||||
this.voice = (config.voice as OpenAITTSVoice) || "coral";
|
||||
this.speed = config.speed || 1.0;
|
||||
this.instructions = config.instructions;
|
||||
|
||||
if (!this.apiKey) {
|
||||
throw new Error(
|
||||
"OpenAI API key required (set OPENAI_API_KEY or pass apiKey)",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate speech audio from text.
|
||||
* Returns raw PCM audio data (24kHz, mono, 16-bit).
|
||||
*/
|
||||
async synthesize(text: string, instructions?: string): Promise<Buffer> {
|
||||
// Build request body
|
||||
const body: Record<string, unknown> = {
|
||||
model: this.model,
|
||||
input: text,
|
||||
voice: this.voice,
|
||||
response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
|
||||
speed: this.speed,
|
||||
};
|
||||
|
||||
// Add instructions if using gpt-4o-mini-tts model
|
||||
const effectiveInstructions = instructions || this.instructions;
|
||||
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
|
||||
body.instructions = effectiveInstructions;
|
||||
}
|
||||
|
||||
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.text();
|
||||
throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
|
||||
}
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
return Buffer.from(arrayBuffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate speech and convert to mu-law format for Twilio.
|
||||
* Twilio Media Streams expect 8kHz mono mu-law audio.
|
||||
*/
|
||||
async synthesizeForTwilio(text: string): Promise<Buffer> {
|
||||
// Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
|
||||
const pcm24k = await this.synthesize(text);
|
||||
|
||||
// Resample from 24kHz to 8kHz
|
||||
const pcm8k = resample24kTo8k(pcm24k);
|
||||
|
||||
// Encode to mu-law
|
||||
return pcmToMulaw(pcm8k);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample 24kHz PCM to 8kHz using linear interpolation.
|
||||
* Input/output: 16-bit signed little-endian mono.
|
||||
*/
|
||||
function resample24kTo8k(input: Buffer): Buffer {
|
||||
const inputSamples = input.length / 2;
|
||||
const outputSamples = Math.floor(inputSamples / 3);
|
||||
const output = Buffer.alloc(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
// Calculate position in input (3:1 ratio)
|
||||
const srcPos = i * 3;
|
||||
const srcIdx = srcPos * 2;
|
||||
|
||||
if (srcIdx + 3 < input.length) {
|
||||
// Linear interpolation between samples
|
||||
const s0 = input.readInt16LE(srcIdx);
|
||||
const s1 = input.readInt16LE(srcIdx + 2);
|
||||
const frac = srcPos % 1 || 0;
|
||||
const sample = Math.round(s0 + frac * (s1 - s0));
|
||||
output.writeInt16LE(clamp16(sample), i * 2);
|
||||
} else {
|
||||
// Last sample
|
||||
output.writeInt16LE(input.readInt16LE(srcIdx), i * 2);
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clamp value to 16-bit signed integer range.
|
||||
*/
|
||||
function clamp16(value: number): number {
|
||||
return Math.max(-32768, Math.min(32767, value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 16-bit PCM to 8-bit mu-law.
|
||||
* Standard G.711 mu-law encoding for telephony.
|
||||
*/
|
||||
function pcmToMulaw(pcm: Buffer): Buffer {
|
||||
const samples = pcm.length / 2;
|
||||
const mulaw = Buffer.alloc(samples);
|
||||
|
||||
for (let i = 0; i < samples; i++) {
|
||||
const sample = pcm.readInt16LE(i * 2);
|
||||
mulaw[i] = linearToMulaw(sample);
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a single 16-bit linear sample to 8-bit mu-law.
|
||||
* Implements ITU-T G.711 mu-law encoding.
|
||||
*/
|
||||
function linearToMulaw(sample: number): number {
|
||||
const BIAS = 132;
|
||||
const CLIP = 32635;
|
||||
|
||||
// Get sign bit
|
||||
const sign = sample < 0 ? 0x80 : 0;
|
||||
if (sample < 0) sample = -sample;
|
||||
|
||||
// Clip to prevent overflow
|
||||
if (sample > CLIP) sample = CLIP;
|
||||
|
||||
// Add bias and find segment
|
||||
sample += BIAS;
|
||||
let exponent = 7;
|
||||
for (
|
||||
let expMask = 0x4000;
|
||||
(sample & expMask) === 0 && exponent > 0;
|
||||
exponent--, expMask >>= 1
|
||||
) {
|
||||
// Find the segment (exponent)
|
||||
}
|
||||
|
||||
// Extract mantissa bits
|
||||
const mantissa = (sample >> (exponent + 3)) & 0x0f;
|
||||
|
||||
// Combine into mu-law byte (inverted for transmission)
|
||||
return ~(sign | (exponent << 4) | mantissa) & 0xff;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert 8-bit mu-law to 16-bit linear PCM.
|
||||
* Useful for decoding incoming audio.
|
||||
*/
|
||||
export function mulawToLinear(mulaw: number): number {
|
||||
// mu-law is transmitted inverted
|
||||
mulaw = ~mulaw & 0xff;
|
||||
|
||||
const sign = mulaw & 0x80;
|
||||
const exponent = (mulaw >> 4) & 0x07;
|
||||
const mantissa = mulaw & 0x0f;
|
||||
|
||||
let sample = ((mantissa << 3) + 132) << exponent;
|
||||
sample -= 132;
|
||||
|
||||
return sign ? -sample : sample;
|
||||
}
|
||||
|
||||
/**
|
||||
* Chunk audio buffer into 20ms frames for streaming.
|
||||
* At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
|
||||
*/
|
||||
export function chunkAudio(
|
||||
audio: Buffer,
|
||||
chunkSize = 160,
|
||||
): Generator<Buffer, void, unknown> {
|
||||
return (function* () {
|
||||
for (let i = 0; i < audio.length; i += chunkSize) {
|
||||
yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
|
||||
}
|
||||
})();
|
||||
}
|
||||
Reference in New Issue
Block a user