mirror of
https://github.com/QuantumNous/new-api.git
synced 2026-03-30 08:34:11 +00:00
feat: opt minimax tts req struct
This commit is contained in:
@@ -34,17 +34,20 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf
|
||||
return nil, errors.New("unsupported audio relay mode")
|
||||
}
|
||||
|
||||
voiceID := mapVoiceType(request.Voice)
|
||||
voiceID := request.Voice
|
||||
speed := request.Speed
|
||||
outputFormat := mapOutputFormat(request.ResponseFormat)
|
||||
|
||||
c.Set("response_format", outputFormat)
|
||||
outputFormat := request.ResponseFormat
|
||||
|
||||
minimaxRequest := MiniMaxTTSRequest{
|
||||
Model: getTTSModel(info.OriginModelName),
|
||||
Text: request.Input,
|
||||
VoiceID: voiceID,
|
||||
Speed: speed,
|
||||
Model: info.OriginModelName,
|
||||
Text: request.Input,
|
||||
VoiceSetting: VoiceSetting{
|
||||
VoiceID: voiceID,
|
||||
Speed: speed,
|
||||
},
|
||||
AudioSetting: &AudioSetting{
|
||||
Format: outputFormat,
|
||||
},
|
||||
OutputFormat: outputFormat,
|
||||
}
|
||||
|
||||
@@ -59,6 +62,11 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error marshalling minimax request: %w", err)
|
||||
}
|
||||
if outputFormat != "hex" {
|
||||
outputFormat = "url"
|
||||
}
|
||||
|
||||
c.Set("response_format", outputFormat)
|
||||
|
||||
// Debug: log the request structure
|
||||
fmt.Printf("MiniMax TTS Request: %s\n", string(jsonData))
|
||||
@@ -79,12 +87,6 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) {
|
||||
|
||||
func (a *Adaptor) SetupRequestHeader(c *gin.Context, req *http.Header, info *relaycommon.RelayInfo) error {
|
||||
channel.SetupApiRequestHeader(info, c, req)
|
||||
|
||||
if info.RelayMode == constant.RelayModeAudioSpeech {
|
||||
req.Set("Content-Type", "application/json")
|
||||
return nil
|
||||
}
|
||||
|
||||
req.Set("Authorization", "Bearer "+info.ApiKey)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
package minimax
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/QuantumNous/new-api/dto"
|
||||
relaycommon "github.com/QuantumNous/new-api/relay/common"
|
||||
@@ -14,96 +16,78 @@ import (
|
||||
)
|
||||
|
||||
type MiniMaxTTSRequest struct {
|
||||
Model string `json:"model"`
|
||||
Text string `json:"text"`
|
||||
VoiceID string `json:"voice_id"`
|
||||
Speed float64 `json:"speed,omitempty"`
|
||||
Vol float64 `json:"vol,omitempty"`
|
||||
Pitch int `json:"pitch,omitempty"`
|
||||
AudioSampleRate int `json:"audio_sample_rate,omitempty"`
|
||||
OutputFormat string `json:"output_format,omitempty"`
|
||||
Model string `json:"model"`
|
||||
Text string `json:"text"`
|
||||
Stream bool `json:"stream,omitempty"`
|
||||
StreamOptions *StreamOptions `json:"stream_options,omitempty"`
|
||||
VoiceSetting VoiceSetting `json:"voice_setting"`
|
||||
PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"`
|
||||
AudioSetting *AudioSetting `json:"audio_setting,omitempty"`
|
||||
TimbreWeights []TimbreWeight `json:"timbre_weights,omitempty"`
|
||||
LanguageBoost string `json:"language_boost,omitempty"`
|
||||
VoiceModify *VoiceModify `json:"voice_modify,omitempty"`
|
||||
SubtitleEnable bool `json:"subtitle_enable,omitempty"`
|
||||
OutputFormat string `json:"output_format,omitempty"`
|
||||
AigcWatermark bool `json:"aigc_watermark,omitempty"`
|
||||
}
|
||||
|
||||
type StreamOptions struct {
|
||||
ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"`
|
||||
}
|
||||
|
||||
type VoiceSetting struct {
|
||||
VoiceID string `json:"voice_id"`
|
||||
Speed float64 `json:"speed,omitempty"`
|
||||
Vol float64 `json:"vol,omitempty"`
|
||||
Pitch int `json:"pitch,omitempty"`
|
||||
Emotion string `json:"emotion,omitempty"`
|
||||
TextNormalization bool `json:"text_normalization,omitempty"`
|
||||
LatexRead bool `json:"latex_read,omitempty"`
|
||||
}
|
||||
|
||||
type PronunciationDict struct {
|
||||
Tone []string `json:"tone,omitempty"`
|
||||
}
|
||||
|
||||
type AudioSetting struct {
|
||||
SampleRate int `json:"sample_rate,omitempty"`
|
||||
Bitrate int `json:"bitrate,omitempty"`
|
||||
Format string `json:"format,omitempty"`
|
||||
Channel int `json:"channel,omitempty"`
|
||||
ForceCbr bool `json:"force_cbr,omitempty"`
|
||||
}
|
||||
|
||||
type TimbreWeight struct {
|
||||
VoiceID string `json:"voice_id"`
|
||||
Weight int `json:"weight"`
|
||||
}
|
||||
|
||||
type VoiceModify struct {
|
||||
Pitch int `json:"pitch,omitempty"`
|
||||
Intensity int `json:"intensity,omitempty"`
|
||||
Timbre int `json:"timbre,omitempty"`
|
||||
SoundEffects string `json:"sound_effects,omitempty"`
|
||||
}
|
||||
|
||||
type MiniMaxTTSResponse struct {
|
||||
Created int `json:"created"`
|
||||
Data []MiniMaxTTSData `json:"data"`
|
||||
ID string `json:"id"`
|
||||
Model string `json:"model"`
|
||||
Object string `json:"object"`
|
||||
Usage MiniMaxTTSUsage `json:"usage"`
|
||||
Data MiniMaxTTSData `json:"data"`
|
||||
ExtraInfo MiniMaxExtraInfo `json:"extra_info"`
|
||||
TraceID string `json:"trace_id"`
|
||||
BaseResp MiniMaxBaseResp `json:"base_resp"`
|
||||
}
|
||||
|
||||
type MiniMaxTTSData struct {
|
||||
Index int `json:"index"`
|
||||
Audio string `json:"audio"`
|
||||
Text string `json:"text"`
|
||||
FinishReason string `json:"finish_reason"`
|
||||
Audio string `json:"audio"`
|
||||
Status int `json:"status"`
|
||||
}
|
||||
|
||||
type MiniMaxTTSUsage struct {
|
||||
TotalTokens int `json:"total_tokens"`
|
||||
type MiniMaxExtraInfo struct {
|
||||
UsageCharacters int64 `json:"usage_characters"`
|
||||
}
|
||||
|
||||
type MiniMaxTTSErrorResponse struct {
|
||||
Error MiniMaxTTSError `json:"error"`
|
||||
}
|
||||
|
||||
type MiniMaxTTSError struct {
|
||||
Code string `json:"code"`
|
||||
Message string `json:"message"`
|
||||
Type string `json:"type"`
|
||||
}
|
||||
|
||||
// OpenAI voice to MiniMax voice_id mapping
|
||||
var openAIToMiniMaxVoiceMap = map[string]string{
|
||||
"alloy": "male-qn-qingse",
|
||||
"echo": "male-qn-jingying",
|
||||
"fable": "female-shaonv",
|
||||
"onyx": "male-qn-badao",
|
||||
"nova": "female-shaonv-jingpin",
|
||||
"shimmer": "female-yujie",
|
||||
// Add some standard MiniMax voice IDs
|
||||
"voice-1": "male-qn-qingse",
|
||||
"voice-2": "female-shaonv",
|
||||
}
|
||||
|
||||
// OpenAI response format to MiniMax output format mapping
|
||||
var responseFormatToOutputFormatMap = map[string]string{
|
||||
"mp3": "mp3",
|
||||
"opus": "mp3",
|
||||
"aac": "aac",
|
||||
"flac": "flac",
|
||||
"wav": "wav",
|
||||
"pcm": "pcm",
|
||||
}
|
||||
|
||||
// TTS model mapping - MiniMax uses speech-01 or speech-01-turbo
|
||||
var modelToTTSModelMap = map[string]string{
|
||||
"speech-01": "speech-01",
|
||||
"speech-01-turbo": "speech-01-turbo",
|
||||
"tts-1": "speech-01-turbo",
|
||||
"tts-1-hd": "speech-01",
|
||||
}
|
||||
|
||||
func mapVoiceType(openAIVoice string) string {
|
||||
if voice, ok := openAIToMiniMaxVoiceMap[openAIVoice]; ok {
|
||||
return voice
|
||||
}
|
||||
return "female-shaonv" // default voice
|
||||
}
|
||||
|
||||
func mapOutputFormat(responseFormat string) string {
|
||||
if format, ok := responseFormatToOutputFormatMap[responseFormat]; ok {
|
||||
return format
|
||||
}
|
||||
return "mp3" // default format
|
||||
}
|
||||
|
||||
func getTTSModel(modelName string) string {
|
||||
if ttsModel, ok := modelToTTSModelMap[modelName]; ok {
|
||||
return ttsModel
|
||||
}
|
||||
return "speech-01-turbo" // default model
|
||||
type MiniMaxBaseResp struct {
|
||||
StatusCode int64 `json:"status_code"`
|
||||
StatusMsg string `json:"status_msg"`
|
||||
}
|
||||
|
||||
func getContentTypeByFormat(format string) string {
|
||||
@@ -124,66 +108,64 @@ func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.Re
|
||||
body, readErr := io.ReadAll(resp.Body)
|
||||
if readErr != nil {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
errors.New("failed to read minimax response"),
|
||||
fmt.Errorf("failed to read minimax response: %w", readErr),
|
||||
types.ErrorCodeReadResponseBodyFailed,
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// First try to parse as error response
|
||||
var errorResp MiniMaxTTSErrorResponse
|
||||
if unmarshalErr := json.Unmarshal(body, &errorResp); unmarshalErr == nil && errorResp.Error.Code != "" {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
errors.New(errorResp.Error.Message),
|
||||
types.ErrorCodeBadResponse,
|
||||
http.StatusBadRequest,
|
||||
)
|
||||
}
|
||||
|
||||
// Parse as successful response
|
||||
// Parse response
|
||||
var minimaxResp MiniMaxTTSResponse
|
||||
if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
errors.New("failed to parse minimax response"),
|
||||
fmt.Errorf("failed to unmarshal minimax TTS response: %w", unmarshalErr),
|
||||
types.ErrorCodeBadResponseBody,
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
}
|
||||
|
||||
// Check base_resp status code
|
||||
if minimaxResp.BaseResp.StatusCode != 0 {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
fmt.Errorf("minimax TTS error: %d - %s", minimaxResp.BaseResp.StatusCode, minimaxResp.BaseResp.StatusMsg),
|
||||
types.ErrorCodeBadResponse,
|
||||
http.StatusBadRequest,
|
||||
)
|
||||
}
|
||||
|
||||
// Check if we have audio data
|
||||
if len(minimaxResp.Data) == 0 || minimaxResp.Data[0].Audio == "" {
|
||||
if minimaxResp.Data.Audio == "" {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
errors.New("no audio data in response"),
|
||||
fmt.Errorf("no audio data in minimax TTS response"),
|
||||
types.ErrorCodeBadResponse,
|
||||
http.StatusBadRequest,
|
||||
)
|
||||
}
|
||||
|
||||
// Decode base64 audio data
|
||||
audioData, decodeErr := base64.StdEncoding.DecodeString(minimaxResp.Data[0].Audio)
|
||||
if decodeErr != nil {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
errors.New("failed to decode audio data"),
|
||||
types.ErrorCodeBadResponseBody,
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
}
|
||||
if strings.HasPrefix(minimaxResp.Data.Audio, "http") {
|
||||
c.Redirect(http.StatusFound, minimaxResp.Data.Audio)
|
||||
} else {
|
||||
// Handle hex-encoded audio data
|
||||
audioData, decodeErr := hex.DecodeString(minimaxResp.Data.Audio)
|
||||
if decodeErr != nil {
|
||||
return nil, types.NewErrorWithStatusCode(
|
||||
fmt.Errorf("failed to decode hex audio data: %w", decodeErr),
|
||||
types.ErrorCodeBadResponse,
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
}
|
||||
|
||||
// Get output format from context or default to mp3
|
||||
outputFormat := c.GetString("response_format")
|
||||
if outputFormat == "" {
|
||||
outputFormat = "mp3"
|
||||
}
|
||||
// Determine content type - default to mp3
|
||||
contentType := "audio/mpeg"
|
||||
|
||||
contentType := getContentTypeByFormat(outputFormat)
|
||||
c.Header("Content-Type", contentType)
|
||||
c.Data(http.StatusOK, contentType, audioData)
|
||||
c.Data(http.StatusOK, contentType, audioData)
|
||||
}
|
||||
|
||||
usage = &dto.Usage{
|
||||
PromptTokens: info.PromptTokens,
|
||||
CompletionTokens: 0,
|
||||
TotalTokens: minimaxResp.Usage.TotalTokens,
|
||||
TotalTokens: int(minimaxResp.ExtraInfo.UsageCharacters),
|
||||
}
|
||||
|
||||
return usage, nil
|
||||
|
||||
Reference in New Issue
Block a user