From fe68488b1cac9e179c8722bf8493193c5034f07a Mon Sep 17 00:00:00 2001 From: feitianbubu Date: Fri, 17 Oct 2025 18:45:29 +0800 Subject: [PATCH 1/5] feat: add doubao audio tts --- relay/channel/volcengine/adaptor.go | 63 +++++++++- relay/channel/volcengine/tts.go | 184 ++++++++++++++++++++++++++++ 2 files changed, 244 insertions(+), 3 deletions(-) create mode 100644 relay/channel/volcengine/tts.go diff --git a/relay/channel/volcengine/adaptor.go b/relay/channel/volcengine/adaptor.go index a428f08ce..21abe1d02 100644 --- a/relay/channel/volcengine/adaptor.go +++ b/relay/channel/volcengine/adaptor.go @@ -37,8 +37,49 @@ func (a *Adaptor) ConvertClaudeRequest(c *gin.Context, info *relaycommon.RelayIn } func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInfo, request dto.AudioRequest) (io.Reader, error) { - //TODO implement me - return nil, errors.New("not implemented") + if info.RelayMode != constant.RelayModeAudioSpeech { + return nil, errors.New("unsupported audio relay mode") + } + + appID, token, err := parseVolcengineAuth(info.ApiKey) + if err != nil { + return nil, err + } + + voiceType := mapVoiceType(request.Voice) + speedRatio := mapSpeedRatio(request.Speed) + encoding := mapEncoding(request.ResponseFormat) + + c.Set("response_format", encoding) + + volcRequest := VolcengineTTSRequest{ + App: VolcengineTTSApp{ + AppID: appID, + Token: token, + Cluster: "volcano_tts", + }, + User: VolcengineTTSUser{ + UID: "openai_relay_user", + }, + Audio: VolcengineTTSAudio{ + VoiceType: voiceType, + Encoding: encoding, + SpeedRatio: speedRatio, + Rate: 24000, + }, + Request: VolcengineTTSReqInfo{ + ReqID: generateRequestID(), + Text: request.Input, + Operation: "query", + }, + } + + jsonData, err := json.Marshal(volcRequest) + if err != nil { + return nil, fmt.Errorf("error marshalling volcengine request: %w", err) + } + + return bytes.NewReader(jsonData), nil } func (a *Adaptor) ConvertImageRequest(c *gin.Context, info *relaycommon.RelayInfo, request dto.ImageRequest) (any, error) { @@ -190,7 +231,6 @@ func (a *Adaptor) Init(info *relaycommon.RelayInfo) { } func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) { - // 支持自定义域名,如果未设置则使用默认域名 baseUrl := info.ChannelBaseUrl if baseUrl == "" { baseUrl = channelconstant.ChannelBaseURLs[channelconstant.ChannelTypeVolcEngine] @@ -217,6 +257,8 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) { return fmt.Sprintf("%s/api/v3/images/edits", baseUrl), nil case constant.RelayModeRerank: return fmt.Sprintf("%s/api/v3/rerank", baseUrl), nil + case constant.RelayModeAudioSpeech: + return "https://openspeech.bytedance.com/api/v1/tts", nil default: } } @@ -225,6 +267,16 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) { func (a *Adaptor) SetupRequestHeader(c *gin.Context, req *http.Header, info *relaycommon.RelayInfo) error { channel.SetupApiRequestHeader(info, c, req) + + if info.RelayMode == constant.RelayModeAudioSpeech { + parts := strings.Split(info.ApiKey, "|") + if len(parts) == 2 { + req.Set("Authorization", "Bearer;"+parts[1]) + } + req.Set("Content-Type", "application/json") + return nil + } + req.Set("Authorization", "Bearer "+info.ApiKey) return nil } @@ -260,6 +312,11 @@ func (a *Adaptor) DoRequest(c *gin.Context, info *relaycommon.RelayInfo, request } func (a *Adaptor) DoResponse(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo) (usage any, err *types.NewAPIError) { + if info.RelayMode == constant.RelayModeAudioSpeech { + encoding := mapEncoding(c.GetString("response_format")) + return handleTTSResponse(c, resp, encoding) + } + adaptor := openai.Adaptor{} usage, err = adaptor.DoResponse(c, resp, info) return diff --git a/relay/channel/volcengine/tts.go b/relay/channel/volcengine/tts.go new file mode 100644 index 000000000..a61ad3405 --- /dev/null +++ b/relay/channel/volcengine/tts.go @@ -0,0 +1,184 @@ +package volcengine + +import ( + "encoding/base64" + "encoding/json" + "errors" + "io" + "net/http" + "strings" + + "github.com/QuantumNous/new-api/dto" + relaycommon "github.com/QuantumNous/new-api/relay/common" + "github.com/QuantumNous/new-api/types" + "github.com/gin-gonic/gin" + "github.com/google/uuid" +) + +type VolcengineTTSRequest struct { + App VolcengineTTSApp `json:"app"` + User VolcengineTTSUser `json:"user"` + Audio VolcengineTTSAudio `json:"audio"` + Request VolcengineTTSReqInfo `json:"request"` +} + +type VolcengineTTSApp struct { + AppID string `json:"appid"` + Token string `json:"token"` + Cluster string `json:"cluster"` +} + +type VolcengineTTSUser struct { + UID string `json:"uid"` +} + +type VolcengineTTSAudio struct { + VoiceType string `json:"voice_type"` + Encoding string `json:"encoding"` + SpeedRatio float64 `json:"speed_ratio"` + Rate int `json:"rate"` + LoudnessRatio float64 `json:"loudness_ratio,omitempty"` +} + +type VolcengineTTSReqInfo struct { + ReqID string `json:"reqid"` + Text string `json:"text"` + Operation string `json:"operation"` +} + +type VolcengineTTSResponse struct { + ReqID string `json:"reqid"` + Code int `json:"code"` + Message string `json:"message"` + Sequence int `json:"sequence"` + Data string `json:"data"` + Addition *VolcengineTTSAdditionInfo `json:"addition,omitempty"` +} + +type VolcengineTTSAdditionInfo struct { + Duration string `json:"duration"` +} + +var openAIToVolcengineVoiceMap = map[string]string{ + "alloy": "zh_male_M392_conversation_wvae_bigtts", + "echo": "zh_male_wenhao_mars_bigtts", + "fable": "zh_female_tianmei_mars_bigtts", + "onyx": "zh_male_zhibei_mars_bigtts", + "nova": "zh_female_shuangkuaisisi_mars_bigtts", + "shimmer": "zh_female_cancan_mars_bigtts", +} + +var responseFormatToEncodingMap = map[string]string{ + "mp3": "mp3", + "opus": "ogg_opus", + "aac": "mp3", + "flac": "mp3", + "wav": "wav", + "pcm": "pcm", +} + +func parseVolcengineAuth(apiKey string) (appID, token string, err error) { + parts := strings.Split(apiKey, "|") + if len(parts) != 2 { + return "", "", errors.New("invalid api key format, expected: appid:access_token") + } + return parts[0], parts[1], nil +} + +func mapVoiceType(openAIVoice string) string { + if voice, ok := openAIToVolcengineVoiceMap[openAIVoice]; ok { + return voice + } + return "zh_male_M392_conversation_wvae_bigtts" +} + +func mapSpeedRatio(speed float64) float64 { + if speed == 0 { + return 1.0 + } + if speed < 0.1 { + return 0.1 + } + if speed > 2.0 { + return 2.0 + } + return speed +} + +func mapEncoding(responseFormat string) string { + if responseFormat == "" { + return "mp3" + } + if encoding, ok := responseFormatToEncodingMap[responseFormat]; ok { + return encoding + } + return "mp3" +} + +func getContentTypeByEncoding(encoding string) string { + contentTypeMap := map[string]string{ + "mp3": "audio/mpeg", + "ogg_opus": "audio/ogg", + "wav": "audio/wav", + "pcm": "audio/pcm", + } + if ct, ok := contentTypeMap[encoding]; ok { + return ct + } + return "application/octet-stream" +} + +func handleTTSResponse(c *gin.Context, resp *http.Response, encoding string) (usage any, err *types.NewAPIError) { + body, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, types.NewErrorWithStatusCode( + errors.New("failed to read volcengine response"), + types.ErrorCodeReadResponseBodyFailed, + http.StatusInternalServerError, + ) + } + defer resp.Body.Close() + + var volcResp VolcengineTTSResponse + if unmarshalErr := json.Unmarshal(body, &volcResp); unmarshalErr != nil { + return nil, types.NewErrorWithStatusCode( + errors.New("failed to parse volcengine response"), + types.ErrorCodeBadResponseBody, + http.StatusInternalServerError, + ) + } + + if volcResp.Code != 3000 { + return nil, types.NewErrorWithStatusCode( + errors.New(volcResp.Message), + types.ErrorCodeBadResponse, + http.StatusBadRequest, + ) + } + + audioData, decodeErr := base64.StdEncoding.DecodeString(volcResp.Data) + if decodeErr != nil { + return nil, types.NewErrorWithStatusCode( + errors.New("failed to decode audio data"), + types.ErrorCodeBadResponseBody, + http.StatusInternalServerError, + ) + } + + contentType := getContentTypeByEncoding(encoding) + c.Header("Content-Type", contentType) + c.Data(http.StatusOK, contentType, audioData) + + info := c.MustGet("relay_info").(*relaycommon.RelayInfo) + usage = &dto.Usage{ + PromptTokens: info.PromptTokens, + CompletionTokens: 0, + TotalTokens: info.PromptTokens, + } + + return usage, nil +} + +func generateRequestID() string { + return uuid.New().String() +} From 58c4da0ddf5784e7f9e1f934aeb601b2a5c888a0 Mon Sep 17 00:00:00 2001 From: feitianbubu Date: Fri, 17 Oct 2025 20:32:00 +0800 Subject: [PATCH 2/5] feat: switch to official TTS only when baseUrl is Volcano's official URL --- relay/channel/volcengine/adaptor.go | 7 ++++- relay/channel/volcengine/tts.go | 45 +++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/relay/channel/volcengine/adaptor.go b/relay/channel/volcengine/adaptor.go index 21abe1d02..ec0bf6e22 100644 --- a/relay/channel/volcengine/adaptor.go +++ b/relay/channel/volcengine/adaptor.go @@ -71,6 +71,7 @@ func (a *Adaptor) ConvertAudioRequest(c *gin.Context, info *relaycommon.RelayInf ReqID: generateRequestID(), Text: request.Input, Operation: "query", + Model: info.OriginModelName, }, } @@ -258,7 +259,11 @@ func (a *Adaptor) GetRequestURL(info *relaycommon.RelayInfo) (string, error) { case constant.RelayModeRerank: return fmt.Sprintf("%s/api/v3/rerank", baseUrl), nil case constant.RelayModeAudioSpeech: - return "https://openspeech.bytedance.com/api/v1/tts", nil + // 只有当 baseUrl 是火山默认的官方Url时才改为官方的的TTS接口,否则走透传的New接口 + if baseUrl == channelconstant.ChannelBaseURLs[channelconstant.ChannelTypeVolcEngine] { + return "https://openspeech.bytedance.com/api/v1/tts", nil + } + return fmt.Sprintf("%s/v1/audio/speech", baseUrl), nil default: } } diff --git a/relay/channel/volcengine/tts.go b/relay/channel/volcengine/tts.go index a61ad3405..98ac0e038 100644 --- a/relay/channel/volcengine/tts.go +++ b/relay/channel/volcengine/tts.go @@ -33,17 +33,44 @@ type VolcengineTTSUser struct { } type VolcengineTTSAudio struct { - VoiceType string `json:"voice_type"` - Encoding string `json:"encoding"` - SpeedRatio float64 `json:"speed_ratio"` - Rate int `json:"rate"` - LoudnessRatio float64 `json:"loudness_ratio,omitempty"` + VoiceType string `json:"voice_type"` + Encoding string `json:"encoding"` + SpeedRatio float64 `json:"speed_ratio"` + Rate int `json:"rate"` + Bitrate int `json:"bitrate,omitempty"` + LoudnessRatio float64 `json:"loudness_ratio,omitempty"` + EnableEmotion bool `json:"enable_emotion,omitempty"` + Emotion string `json:"emotion,omitempty"` + EmotionScale float64 `json:"emotion_scale,omitempty"` + ExplicitLanguage string `json:"explicit_language,omitempty"` + ContextLanguage string `json:"context_language,omitempty"` } type VolcengineTTSReqInfo struct { - ReqID string `json:"reqid"` - Text string `json:"text"` - Operation string `json:"operation"` + ReqID string `json:"reqid"` + Text string `json:"text"` + Operation string `json:"operation"` + Model string `json:"model,omitempty"` + TextType string `json:"text_type,omitempty"` + SilenceDuration float64 `json:"silence_duration,omitempty"` + WithTimestamp interface{} `json:"with_timestamp,omitempty"` + ExtraParam *VolcengineTTSExtraParam `json:"extra_param,omitempty"` +} + +type VolcengineTTSExtraParam struct { + DisableMarkdownFilter bool `json:"disable_markdown_filter,omitempty"` + EnableLatexTn bool `json:"enable_latex_tn,omitempty"` + MuteCutThreshold string `json:"mute_cut_threshold,omitempty"` + MuteCutRemainMs string `json:"mute_cut_remain_ms,omitempty"` + DisableEmojiFilter bool `json:"disable_emoji_filter,omitempty"` + UnsupportedCharRatioThresh float64 `json:"unsupported_char_ratio_thresh,omitempty"` + AigcWatermark bool `json:"aigc_watermark,omitempty"` + CacheConfig *VolcengineTTSCacheConfig `json:"cache_config,omitempty"` +} + +type VolcengineTTSCacheConfig struct { + TextType int `json:"text_type,omitempty"` + UseCache bool `json:"use_cache,omitempty"` } type VolcengineTTSResponse struct { @@ -89,7 +116,7 @@ func mapVoiceType(openAIVoice string) string { if voice, ok := openAIToVolcengineVoiceMap[openAIVoice]; ok { return voice } - return "zh_male_M392_conversation_wvae_bigtts" + return openAIVoice } func mapSpeedRatio(speed float64) float64 { From 735386c0b9ca36d9c7d11ef075adc6973a564a5d Mon Sep 17 00:00:00 2001 From: feitianbubu Date: Fri, 17 Oct 2025 21:33:08 +0800 Subject: [PATCH 3/5] feat: add doubao tts usage token --- relay/channel/volcengine/adaptor.go | 2 +- relay/channel/volcengine/tts.go | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/relay/channel/volcengine/adaptor.go b/relay/channel/volcengine/adaptor.go index ec0bf6e22..94cb776e8 100644 --- a/relay/channel/volcengine/adaptor.go +++ b/relay/channel/volcengine/adaptor.go @@ -319,7 +319,7 @@ func (a *Adaptor) DoRequest(c *gin.Context, info *relaycommon.RelayInfo, request func (a *Adaptor) DoResponse(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo) (usage any, err *types.NewAPIError) { if info.RelayMode == constant.RelayModeAudioSpeech { encoding := mapEncoding(c.GetString("response_format")) - return handleTTSResponse(c, resp, encoding) + return handleTTSResponse(c, resp, info, encoding) } adaptor := openai.Adaptor{} diff --git a/relay/channel/volcengine/tts.go b/relay/channel/volcengine/tts.go index 98ac0e038..dd6199715 100644 --- a/relay/channel/volcengine/tts.go +++ b/relay/channel/volcengine/tts.go @@ -119,6 +119,7 @@ func mapVoiceType(openAIVoice string) string { return openAIVoice } +// [0.1,2],默认为 1,通常保留一位小数即可 func mapSpeedRatio(speed float64) float64 { if speed == 0 { return 1.0 @@ -133,9 +134,6 @@ func mapSpeedRatio(speed float64) float64 { } func mapEncoding(responseFormat string) string { - if responseFormat == "" { - return "mp3" - } if encoding, ok := responseFormatToEncodingMap[responseFormat]; ok { return encoding } @@ -155,7 +153,7 @@ func getContentTypeByEncoding(encoding string) string { return "application/octet-stream" } -func handleTTSResponse(c *gin.Context, resp *http.Response, encoding string) (usage any, err *types.NewAPIError) { +func handleTTSResponse(c *gin.Context, resp *http.Response, info *relaycommon.RelayInfo, encoding string) (usage any, err *types.NewAPIError) { body, readErr := io.ReadAll(resp.Body) if readErr != nil { return nil, types.NewErrorWithStatusCode( @@ -196,7 +194,6 @@ func handleTTSResponse(c *gin.Context, resp *http.Response, encoding string) (us c.Header("Content-Type", contentType) c.Data(http.StatusOK, contentType, audioData) - info := c.MustGet("relay_info").(*relaycommon.RelayInfo) usage = &dto.Usage{ PromptTokens: info.PromptTokens, CompletionTokens: 0, From 8965fc49c951f1aaea63aa4bc5fedfd2e5813403 Mon Sep 17 00:00:00 2001 From: feitianbubu Date: Fri, 17 Oct 2025 22:05:58 +0800 Subject: [PATCH 4/5] feat: add doubao audio token input prompt --- web/src/components/table/channels/modals/EditChannelModal.jsx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web/src/components/table/channels/modals/EditChannelModal.jsx b/web/src/components/table/channels/modals/EditChannelModal.jsx index 4c4229358..c9c284abc 100644 --- a/web/src/components/table/channels/modals/EditChannelModal.jsx +++ b/web/src/components/table/channels/modals/EditChannelModal.jsx @@ -107,6 +107,8 @@ function type2secretPrompt(type) { return '按照如下格式输入:AppId|SecretId|SecretKey'; case 33: return '按照如下格式输入:Ak|Sk|Region'; + case 45: + return '请输入渠道对应的鉴权密钥, 豆包语音输入:AppId|AccessToken'; case 50: return '按照如下格式输入: AccessKey|SecretKey, 如果上游是New API,则直接输ApiKey'; case 51: From 6a24c37c0ed6d437e378f301eb65bd70c21e5d92 Mon Sep 17 00:00:00 2001 From: IcedTangerine Date: Sat, 18 Oct 2025 00:13:28 +0800 Subject: [PATCH 5/5] Fix error message for invalid API key format --- relay/channel/volcengine/tts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relay/channel/volcengine/tts.go b/relay/channel/volcengine/tts.go index dd6199715..4cd591396 100644 --- a/relay/channel/volcengine/tts.go +++ b/relay/channel/volcengine/tts.go @@ -107,7 +107,7 @@ var responseFormatToEncodingMap = map[string]string{ func parseVolcengineAuth(apiKey string) (appID, token string, err error) { parts := strings.Split(apiKey, "|") if len(parts) != 2 { - return "", "", errors.New("invalid api key format, expected: appid:access_token") + return "", "", errors.New("invalid api key format, expected: appid|access_token") } return parts[0], parts[1], nil }