From c7ab0f4f3dd4a46623209d68b21939aae7e97750 Mon Sep 17 00:00:00 2001 From: feitianbubu Date: Mon, 20 Oct 2025 14:46:04 +0800 Subject: [PATCH] feat: opt minimax tts req struct --- relay/channel/minimax/adaptor.go | 30 ++--- relay/channel/minimax/tts.go | 210 ++++++++++++++----------------- 2 files changed, 112 insertions(+), 128 deletions(-) diff --git a/relay/channel/minimax/adaptor.go b/relay/channel/minimax/adaptor.go index cba2271cf..fac0b9a9b 100644 --- a/relay/channel/minimax/adaptor.go +++ b/relay/channel/minimax/adaptor.go @@ -34,17 +34,20 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf return nil, errors.New("unsupported audio relay mode") } - voiceID := mapVoiceType(request.Voice) + voiceID := request.Voice speed := request.Speed - outputFormat := mapOutputFormat(request.ResponseFormat) - - c.Set("response_format", outputFormat) + outputFormat := request.ResponseFormat minimaxRequest := MiniMaxTTSRequest{ - Model: getTTSModel(info.OriginModelName), - Text: request.Input, - VoiceID: voiceID, - Speed: speed, + Model: info.OriginModelName, + Text: request.Input, + VoiceSetting: VoiceSetting{ + VoiceID: voiceID, + Speed: speed, + }, + AudioSetting: &AudioSetting{ + Format: outputFormat, + }, OutputFormat: outputFormat, } @@ -59,6 +62,11 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf if err != nil { return nil, fmt.Errorf("error marshalling minimax request: %w", err) } + if outputFormat != "hex" { + outputFormat = "url" + } + + c.Set("response_format", outputFormat) // Debug: log the request structure fmt.Printf("MiniMax TTS Request: %s\n", string(jsonData)) @@ -79,12 +87,6 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) { func (a *Adaptor) SetupRequestHeader(c *gin.Context, req *http.Header, info *relaycommon.RelayInfo) error { channel.SetupApiRequestHeader(info, c, req) - - if info.RelayMode == constant.RelayModeAudioSpeech { - req.Set("Content-Type", "application/json") - return nil - } - req.Set("Authorization", "Bearer "+info.ApiKey) return nil } diff --git a/relay/channel/minimax/tts.go b/relay/channel/minimax/tts.go index a8925ee85..4a52d2145 100644 --- a/relay/channel/minimax/tts.go +++ b/relay/channel/minimax/tts.go @@ -1,11 +1,13 @@ package minimax import ( - "encoding/base64" + "encoding/hex" "encoding/json" "errors" + "fmt" "io" "net/http" + "strings" "github.com/QuantumNous/new-api/dto" relaycommon "github.com/QuantumNous/new-api/relay/common" @@ -14,96 +16,78 @@ import ( ) type MiniMaxTTSRequest struct { - Model string `json:"model"` - Text string `json:"text"` - VoiceID string `json:"voice_id"` - Speed float64 `json:"speed,omitempty"` - Vol float64 `json:"vol,omitempty"` - Pitch int `json:"pitch,omitempty"` - AudioSampleRate int `json:"audio_sample_rate,omitempty"` - OutputFormat string `json:"output_format,omitempty"` + Model string `json:"model"` + Text string `json:"text"` + Stream bool `json:"stream,omitempty"` + StreamOptions *StreamOptions `json:"stream_options,omitempty"` + VoiceSetting VoiceSetting `json:"voice_setting"` + PronunciationDict *PronunciationDict `json:"pronunciation_dict,omitempty"` + AudioSetting *AudioSetting `json:"audio_setting,omitempty"` + TimbreWeights []TimbreWeight `json:"timbre_weights,omitempty"` + LanguageBoost string `json:"language_boost,omitempty"` + VoiceModify *VoiceModify `json:"voice_modify,omitempty"` + SubtitleEnable bool `json:"subtitle_enable,omitempty"` + OutputFormat string `json:"output_format,omitempty"` + AigcWatermark bool `json:"aigc_watermark,omitempty"` +} + +type StreamOptions struct { + ExcludeAggregatedAudio bool `json:"exclude_aggregated_audio,omitempty"` +} + +type VoiceSetting struct { + VoiceID string `json:"voice_id"` + Speed float64 `json:"speed,omitempty"` + Vol float64 `json:"vol,omitempty"` + Pitch int `json:"pitch,omitempty"` + Emotion string `json:"emotion,omitempty"` + TextNormalization bool `json:"text_normalization,omitempty"` + LatexRead bool `json:"latex_read,omitempty"` +} + +type PronunciationDict struct { + Tone []string `json:"tone,omitempty"` +} + +type AudioSetting struct { + SampleRate int `json:"sample_rate,omitempty"` + Bitrate int `json:"bitrate,omitempty"` + Format string `json:"format,omitempty"` + Channel int `json:"channel,omitempty"` + ForceCbr bool `json:"force_cbr,omitempty"` +} + +type TimbreWeight struct { + VoiceID string `json:"voice_id"` + Weight int `json:"weight"` +} + +type VoiceModify struct { + Pitch int `json:"pitch,omitempty"` + Intensity int `json:"intensity,omitempty"` + Timbre int `json:"timbre,omitempty"` + SoundEffects string `json:"sound_effects,omitempty"` } type MiniMaxTTSResponse struct { - Created int `json:"created"` - Data []MiniMaxTTSData `json:"data"` - ID string `json:"id"` - Model string `json:"model"` - Object string `json:"object"` - Usage MiniMaxTTSUsage `json:"usage"` + Data MiniMaxTTSData `json:"data"` + ExtraInfo MiniMaxExtraInfo `json:"extra_info"` + TraceID string `json:"trace_id"` + BaseResp MiniMaxBaseResp `json:"base_resp"` } type MiniMaxTTSData struct { - Index int `json:"index"` - Audio string `json:"audio"` - Text string `json:"text"` - FinishReason string `json:"finish_reason"` + Audio string `json:"audio"` + Status int `json:"status"` } -type MiniMaxTTSUsage struct { - TotalTokens int `json:"total_tokens"` +type MiniMaxExtraInfo struct { + UsageCharacters int64 `json:"usage_characters"` } -type MiniMaxTTSErrorResponse struct { - Error MiniMaxTTSError `json:"error"` -} - -type MiniMaxTTSError struct { - Code string `json:"code"` - Message string `json:"message"` - Type string `json:"type"` -} - -// OpenAI voice to MiniMax voice_id mapping -var openAIToMiniMaxVoiceMap = map[string]string{ - "alloy": "male-qn-qingse", - "echo": "male-qn-jingying", - "fable": "female-shaonv", - "onyx": "male-qn-badao", - "nova": "female-shaonv-jingpin", - "shimmer": "female-yujie", - // Add some standard MiniMax voice IDs - "voice-1": "male-qn-qingse", - "voice-2": "female-shaonv", -} - -// OpenAI response format to MiniMax output format mapping -var responseFormatToOutputFormatMap = map[string]string{ - "mp3": "mp3", - "opus": "mp3", - "aac": "aac", - "flac": "flac", - "wav": "wav", - "pcm": "pcm", -} - -// TTS model mapping - MiniMax uses speech-01 or speech-01-turbo -var modelToTTSModelMap = map[string]string{ - "speech-01": "speech-01", - "speech-01-turbo": "speech-01-turbo", - "tts-1": "speech-01-turbo", - "tts-1-hd": "speech-01", -} - -func mapVoiceType(openAIVoice string) string { - if voice, ok := openAIToMiniMaxVoiceMap[openAIVoice]; ok { - return voice - } - return "female-shaonv" // default voice -} - -func mapOutputFormat(responseFormat string) string { - if format, ok := responseFormatToOutputFormatMap[responseFormat]; ok { - return format - } - return "mp3" // default format -} - -func getTTSModel(modelName string) string { - if ttsModel, ok := modelToTTSModelMap[modelName]; ok { - return ttsModel - } - return "speech-01-turbo" // default model +type MiniMaxBaseResp struct { + StatusCode int64 `json:"status_code"` + StatusMsg string `json:"status_msg"` } func getContentTypeByFormat(format string) string { @@ -124,66 +108,64 @@ func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.Re body, readErr := io.ReadAll(resp.Body) if readErr != nil { return nil, types.NewErrorWithStatusCode( - errors.New("failed to read minimax response"), + fmt.Errorf("failed to read minimax response: %w", readErr), types.ErrorCodeReadResponseBodyFailed, http.StatusInternalServerError, ) } defer resp.Body.Close() - // First try to parse as error response - var errorResp MiniMaxTTSErrorResponse - if unmarshalErr := json.Unmarshal(body, &errorResp); unmarshalErr == nil && errorResp.Error.Code != "" { - return nil, types.NewErrorWithStatusCode( - errors.New(errorResp.Error.Message), - types.ErrorCodeBadResponse, - http.StatusBadRequest, - ) - } - - // Parse as successful response + // Parse response var minimaxResp MiniMaxTTSResponse if unmarshalErr := json.Unmarshal(body, &minimaxResp); unmarshalErr != nil { return nil, types.NewErrorWithStatusCode( - errors.New("failed to parse minimax response"), + fmt.Errorf("failed to unmarshal minimax TTS response: %w", unmarshalErr), types.ErrorCodeBadResponseBody, http.StatusInternalServerError, ) } + // Check base_resp status code + if minimaxResp.BaseResp.StatusCode != 0 { + return nil, types.NewErrorWithStatusCode( + fmt.Errorf("minimax TTS error: %d - %s", minimaxResp.BaseResp.StatusCode, minimaxResp.BaseResp.StatusMsg), + types.ErrorCodeBadResponse, + http.StatusBadRequest, + ) + } + // Check if we have audio data - if len(minimaxResp.Data) == 0 || minimaxResp.Data[0].Audio == "" { + if minimaxResp.Data.Audio == "" { return nil, types.NewErrorWithStatusCode( - errors.New("no audio data in response"), + fmt.Errorf("no audio data in minimax TTS response"), types.ErrorCodeBadResponse, http.StatusBadRequest, ) } - // Decode base64 audio data - audioData, decodeErr := base64.StdEncoding.DecodeString(minimaxResp.Data[0].Audio) - if decodeErr != nil { - return nil, types.NewErrorWithStatusCode( - errors.New("failed to decode audio data"), - types.ErrorCodeBadResponseBody, - http.StatusInternalServerError, - ) - } + if strings.HasPrefix(minimaxResp.Data.Audio, "http") { + c.Redirect(http.StatusFound, minimaxResp.Data.Audio) + } else { + // Handle hex-encoded audio data + audioData, decodeErr := hex.DecodeString(minimaxResp.Data.Audio) + if decodeErr != nil { + return nil, types.NewErrorWithStatusCode( + fmt.Errorf("failed to decode hex audio data: %w", decodeErr), + types.ErrorCodeBadResponse, + http.StatusInternalServerError, + ) + } - // Get output format from context or default to mp3 - outputFormat := c.GetString("response_format") - if outputFormat == "" { - outputFormat = "mp3" - } + // Determine content type - default to mp3 + contentType := "audio/mpeg" - contentType := getContentTypeByFormat(outputFormat) - c.Header("Content-Type", contentType) - c.Data(http.StatusOK, contentType, audioData) + c.Data(http.StatusOK, contentType, audioData) + } usage = &dto.Usage{ PromptTokens: info.PromptTokens, CompletionTokens: 0, - TotalTokens: minimaxResp.Usage.TotalTokens, + TotalTokens: int(minimaxResp.ExtraInfo.UsageCharacters), } return usage, nil