From 21cfc1ca3804d547082b4eb806042cb659816c73 Mon Sep 17 00:00:00 2001 From: CaIon Date: Sat, 28 Feb 2026 18:42:54 +0800 Subject: [PATCH] feat(gemini): update request structures for Veo predictLongRunning - Refactored the request URL and body construction methods to align with the Veo predictLongRunning endpoint. - Introduced new data structures for Veo instances and parameters, replacing the previous Gemini video generation configurations. - Updated the Vertex adaptor to utilize the new Veo request payload format. --- relay/channel/task/gemini/adaptor.go | 40 +++++++++++++------------ relay/channel/task/gemini/dto.go | 44 ++++++++++++++++------------ relay/channel/task/vertex/adaptor.go | 34 +++------------------ 3 files changed, 51 insertions(+), 67 deletions(-) diff --git a/relay/channel/task/gemini/adaptor.go b/relay/channel/task/gemini/adaptor.go index cbe3e85ad..48aa06319 100644 --- a/relay/channel/task/gemini/adaptor.go +++ b/relay/channel/task/gemini/adaptor.go @@ -44,13 +44,13 @@ func (a *TaskAdaptor) ValidateRequestAndSetAction(c *gin.Context, info *relaycom return relaycommon.ValidateBasicTaskRequest(c, info, constant.TaskActionTextGenerate) } -// BuildRequestURL constructs the Gemini API generateVideos endpoint. +// BuildRequestURL constructs the Gemini API predictLongRunning endpoint for Veo. func (a *TaskAdaptor) BuildRequestURL(info *relaycommon.RelayInfo) (string, error) { modelName := info.UpstreamModelName version := model_setting.GetGeminiVersionSetting(modelName) return fmt.Sprintf( - "%s/%s/models/%s:generateVideos", + "%s/%s/models/%s:predictLongRunning", a.baseURL, version, modelName, @@ -65,7 +65,7 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info return nil } -// BuildRequestBody converts request into the Gemini API generateVideos format. +// BuildRequestBody converts request into the Veo predictLongRunning format. func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayInfo) (io.Reader, error) { v, ok := c.Get("task_request") if !ok { @@ -76,34 +76,36 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn return nil, fmt.Errorf("unexpected task_request type") } - body := GeminiVideoPayload{ - Prompt: req.Prompt, - Config: &GeminiVideoGenerationConfig{}, - } - + instance := VeoInstance{Prompt: req.Prompt} if img := ExtractMultipartImage(c, info); img != nil { - body.Image = img + instance.Image = img } else if len(req.Images) > 0 { if parsed := ParseImageInput(req.Images[0]); parsed != nil { - body.Image = parsed + instance.Image = parsed info.Action = constant.TaskActionGenerate } } - if err := taskcommon.UnmarshalMetadata(req.Metadata, body.Config); err != nil { + params := &VeoParameters{} + if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil { return nil, errors.Wrap(err, "unmarshal metadata failed") } - if body.Config.DurationSeconds == 0 && req.Duration > 0 { - body.Config.DurationSeconds = req.Duration + if params.DurationSeconds == 0 && req.Duration > 0 { + params.DurationSeconds = req.Duration } - if body.Config.Resolution == "" && req.Size != "" { - body.Config.Resolution = SizeToVeoResolution(req.Size) + if params.Resolution == "" && req.Size != "" { + params.Resolution = SizeToVeoResolution(req.Size) } - if body.Config.AspectRatio == "" && req.Size != "" { - body.Config.AspectRatio = SizeToVeoAspectRatio(req.Size) + if params.AspectRatio == "" && req.Size != "" { + params.AspectRatio = SizeToVeoAspectRatio(req.Size) + } + params.Resolution = strings.ToLower(params.Resolution) + params.SampleCount = 1 + + body := VeoRequestPayload{ + Instances: []VeoInstance{instance}, + Parameters: params, } - body.Config.Resolution = strings.ToLower(body.Config.Resolution) - body.Config.NumberOfVideos = 1 data, err := common.Marshal(body) if err != nil { diff --git a/relay/channel/task/gemini/dto.go b/relay/channel/task/gemini/dto.go index b23e3e403..70a13feec 100644 --- a/relay/channel/task/gemini/dto.go +++ b/relay/channel/task/gemini/dto.go @@ -1,16 +1,5 @@ package gemini -// GeminiVideoGenerationConfig represents the Gemini API GenerateVideosConfig. -// Reference: https://ai.google.dev/gemini-api/docs/video -type GeminiVideoGenerationConfig struct { - AspectRatio string `json:"aspectRatio,omitempty"` - DurationSeconds int `json:"durationSeconds,omitempty"` - NegativePrompt string `json:"negativePrompt,omitempty"` - PersonGeneration string `json:"personGeneration,omitempty"` - Resolution string `json:"resolution,omitempty"` - NumberOfVideos int `json:"numberOfVideos,omitempty"` -} - // VeoImageInput represents an image input for Veo image-to-video. // Used by both Gemini and Vertex adaptors. type VeoImageInput struct { @@ -18,17 +7,36 @@ type VeoImageInput struct { MimeType string `json:"mimeType"` } -// GeminiVideoPayload is the top-level request body for the Gemini API -// models/{model}:generateVideos endpoint. -type GeminiVideoPayload struct { - Model string `json:"model,omitempty"` - Prompt string `json:"prompt"` - Image *VeoImageInput `json:"image,omitempty"` - Config *GeminiVideoGenerationConfig `json:"config,omitempty"` +// VeoInstance represents a single instance in the Veo predictLongRunning request. +type VeoInstance struct { + Prompt string `json:"prompt"` + Image *VeoImageInput `json:"image,omitempty"` // TODO: support referenceImages (style/asset references, up to 3 images) // TODO: support lastFrame (first+last frame interpolation, Veo 3.1) } +// VeoParameters represents the parameters block for Veo predictLongRunning. +type VeoParameters struct { + SampleCount int `json:"sampleCount"` + DurationSeconds int `json:"durationSeconds,omitempty"` + AspectRatio string `json:"aspectRatio,omitempty"` + Resolution string `json:"resolution,omitempty"` + NegativePrompt string `json:"negativePrompt,omitempty"` + PersonGeneration string `json:"personGeneration,omitempty"` + StorageUri string `json:"storageUri,omitempty"` + CompressionQuality string `json:"compressionQuality,omitempty"` + ResizeMode string `json:"resizeMode,omitempty"` + Seed *int `json:"seed,omitempty"` + GenerateAudio *bool `json:"generateAudio,omitempty"` +} + +// VeoRequestPayload is the top-level request body for the Veo +// predictLongRunning endpoint (used by both Gemini and Vertex). +type VeoRequestPayload struct { + Instances []VeoInstance `json:"instances"` + Parameters *VeoParameters `json:"parameters,omitempty"` +} + type submitResponse struct { Name string `json:"name"` } diff --git a/relay/channel/task/vertex/adaptor.go b/relay/channel/task/vertex/adaptor.go index 379d34744..b76364ee9 100644 --- a/relay/channel/task/vertex/adaptor.go +++ b/relay/channel/task/vertex/adaptor.go @@ -27,32 +27,6 @@ import ( // Request / Response structures // ============================ -type veoInstance struct { - Prompt string `json:"prompt"` - Image *geminitask.VeoImageInput `json:"image,omitempty"` - // TODO: support referenceImages (style/asset references, up to 3 images) - // TODO: support lastFrame (first+last frame interpolation, Veo 3.1) -} - -type veoParameters struct { - SampleCount int `json:"sampleCount"` - DurationSeconds int `json:"durationSeconds,omitempty"` - AspectRatio string `json:"aspectRatio,omitempty"` - Resolution string `json:"resolution,omitempty"` - NegativePrompt string `json:"negativePrompt,omitempty"` - PersonGeneration string `json:"personGeneration,omitempty"` - StorageUri string `json:"storageUri,omitempty"` - CompressionQuality string `json:"compressionQuality,omitempty"` - ResizeMode string `json:"resizeMode,omitempty"` - Seed *int `json:"seed,omitempty"` - GenerateAudio *bool `json:"generateAudio,omitempty"` -} - -type requestPayload struct { - Instances []veoInstance `json:"instances"` - Parameters *veoParameters `json:"parameters,omitempty"` -} - type fetchOperationPayload struct { OperationName string `json:"operationName"` } @@ -186,7 +160,7 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn } req := v.(relaycommon.TaskSubmitReq) - instance := veoInstance{Prompt: req.Prompt} + instance := geminitask.VeoInstance{Prompt: req.Prompt} if img := geminitask.ExtractMultipartImage(c, info); img != nil { instance.Image = img } else if len(req.Images) > 0 { @@ -196,7 +170,7 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn } } - params := &veoParameters{} + params := &geminitask.VeoParameters{} if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil { return nil, fmt.Errorf("unmarshal metadata failed: %w", err) } @@ -212,8 +186,8 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn params.Resolution = strings.ToLower(params.Resolution) params.SampleCount = 1 - body := requestPayload{ - Instances: []veoInstance{instance}, + body := geminitask.VeoRequestPayload{ + Instances: []geminitask.VeoInstance{instance}, Parameters: params, }