feat(gemini): implement video generation configuration and billing estimation

- Added Gemini video generation configuration structures and payloads.
- Introduced functions for parsing and resolving video duration and resolution from metadata.
- Enhanced the Vertex adaptor to support Gemini video generation requests and billing estimation based on duration and resolution.
- Updated model pricing settings for new Gemini video models.
This commit is contained in:
CaIon
2026-02-28 17:37:08 +08:00
parent d1f2b707e3
commit 2189fd8f3e
7 changed files with 437 additions and 118 deletions

View File

@@ -22,64 +22,6 @@ import (
"github.com/pkg/errors"
)
// ============================
// Request / Response structures
// ============================
// GeminiVideoGenerationConfig represents the video generation configuration
// Based on: https://ai.google.dev/gemini-api/docs/video
type GeminiVideoGenerationConfig struct {
AspectRatio string `json:"aspectRatio,omitempty"` // "16:9" or "9:16"
DurationSeconds float64 `json:"durationSeconds,omitempty"` // 4, 6, or 8 (as number)
NegativePrompt string `json:"negativePrompt,omitempty"` // unwanted elements
PersonGeneration string `json:"personGeneration,omitempty"` // "allow_all" for text-to-video, "allow_adult" for image-to-video
Resolution string `json:"resolution,omitempty"` // video resolution
}
// GeminiVideoRequest represents a single video generation instance
type GeminiVideoRequest struct {
Prompt string `json:"prompt"`
}
// GeminiVideoPayload represents the complete video generation request payload
type GeminiVideoPayload struct {
Instances []GeminiVideoRequest `json:"instances"`
Parameters GeminiVideoGenerationConfig `json:"parameters,omitempty"`
}
type submitResponse struct {
Name string `json:"name"`
}
type operationVideo struct {
MimeType string `json:"mimeType"`
BytesBase64Encoded string `json:"bytesBase64Encoded"`
Encoding string `json:"encoding"`
}
type operationResponse struct {
Name string `json:"name"`
Done bool `json:"done"`
Response struct {
Type string `json:"@type"`
RaiMediaFilteredCount int `json:"raiMediaFilteredCount"`
Videos []operationVideo `json:"videos"`
BytesBase64Encoded string `json:"bytesBase64Encoded"`
Encoding string `json:"encoding"`
Video string `json:"video"`
GenerateVideoResponse struct {
GeneratedSamples []struct {
Video struct {
URI string `json:"uri"`
} `json:"video"`
} `json:"generatedSamples"`
} `json:"generateVideoResponse"`
} `json:"response"`
Error struct {
Message string `json:"message"`
} `json:"error"`
}
// ============================
// Adaptor implementation
// ============================
@@ -99,17 +41,16 @@ func (a *TaskAdaptor) Init(info *relaycommon.RelayInfo) {
// ValidateRequestAndSetAction parses body, validates fields and sets default action.
func (a *TaskAdaptor) ValidateRequestAndSetAction(c *gin.Context, info *relaycommon.RelayInfo) (taskErr *dto.TaskError) {
// Use the standard validation method for TaskSubmitReq
return relaycommon.ValidateBasicTaskRequest(c, info, constant.TaskActionTextGenerate)
}
// BuildRequestURL constructs the upstream URL.
// BuildRequestURL constructs the Gemini API generateVideos endpoint.
func (a *TaskAdaptor) BuildRequestURL(info *relaycommon.RelayInfo) (string, error) {
modelName := info.UpstreamModelName
version := model_setting.GetGeminiVersionSetting(modelName)
return fmt.Sprintf(
"%s/%s/models/%s:predictLongRunning",
"%s/%s/models/%s:generateVideos",
a.baseURL,
version,
modelName,
@@ -124,7 +65,7 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
return nil
}
// BuildRequestBody converts request into Gemini specific format.
// BuildRequestBody converts request into the Gemini API generateVideos format.
func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayInfo) (io.Reader, error) {
v, ok := c.Get("task_request")
if !ok {
@@ -135,18 +76,34 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
return nil, fmt.Errorf("unexpected task_request type")
}
// Create structured video generation request
body := GeminiVideoPayload{
Instances: []GeminiVideoRequest{
{Prompt: req.Prompt},
},
Parameters: GeminiVideoGenerationConfig{},
Prompt: req.Prompt,
Config: &GeminiVideoGenerationConfig{},
}
metadata := req.Metadata
if err := taskcommon.UnmarshalMetadata(metadata, &body.Parameters); err != nil {
if img := ExtractMultipartImage(c, info); img != nil {
body.Image = img
} else if len(req.Images) > 0 {
if parsed := ParseImageInput(req.Images[0]); parsed != nil {
body.Image = parsed
info.Action = constant.TaskActionGenerate
}
}
if err := taskcommon.UnmarshalMetadata(req.Metadata, body.Config); err != nil {
return nil, errors.Wrap(err, "unmarshal metadata failed")
}
if body.Config.DurationSeconds == 0 && req.Duration > 0 {
body.Config.DurationSeconds = req.Duration
}
if body.Config.Resolution == "" && req.Size != "" {
body.Config.Resolution = SizeToVeoResolution(req.Size)
}
if body.Config.AspectRatio == "" && req.Size != "" {
body.Config.AspectRatio = SizeToVeoAspectRatio(req.Size)
}
body.Config.Resolution = strings.ToLower(body.Config.Resolution)
body.Config.NumberOfVideos = 1
data, err := common.Marshal(body)
if err != nil {
@@ -186,14 +143,40 @@ func (a *TaskAdaptor) DoResponse(c *gin.Context, resp *http.Response, info *rela
}
func (a *TaskAdaptor) GetModelList() []string {
return []string{"veo-3.0-generate-001", "veo-3.1-generate-preview", "veo-3.1-fast-generate-preview"}
return []string{
"veo-3.0-generate-001",
"veo-3.0-fast-generate-001",
"veo-3.1-generate-preview",
"veo-3.1-fast-generate-preview",
}
}
func (a *TaskAdaptor) GetChannelName() string {
return "gemini"
}
// FetchTask fetch task status
// EstimateBilling returns OtherRatios based on durationSeconds and resolution.
func (a *TaskAdaptor) EstimateBilling(c *gin.Context, info *relaycommon.RelayInfo) map[string]float64 {
v, ok := c.Get("task_request")
if !ok {
return nil
}
req, ok := v.(relaycommon.TaskSubmitReq)
if !ok {
return nil
}
seconds := ResolveVeoDuration(req.Metadata, req.Duration, req.Seconds)
resolution := ResolveVeoResolution(req.Metadata, req.Size)
resRatio := VeoResolutionRatio(info.UpstreamModelName, resolution)
return map[string]float64{
"seconds": float64(seconds),
"resolution": resRatio,
}
}
// FetchTask polls task status via the Gemini operations GET endpoint.
func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy string) (*http.Response, error) {
taskID, ok := body["task_id"].(string)
if !ok {
@@ -205,7 +188,6 @@ func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy
return nil, fmt.Errorf("decode task_id failed: %w", err)
}
// For Gemini API, we use GET request to the operations endpoint
version := model_setting.GetGeminiVersionSetting("default")
url := fmt.Sprintf("%s/%s/%s", baseUrl, version, upstreamName)
@@ -249,11 +231,9 @@ func (a *TaskAdaptor) ParseTaskResult(respBody []byte) (*relaycommon.TaskInfo, e
ti.Progress = "100%"
ti.TaskID = taskcommon.EncodeLocalTaskID(op.Name)
// Url intentionally left empty — the caller constructs the proxy URL using the public task ID
// Extract URL from generateVideoResponse if available
if len(op.Response.GenerateVideoResponse.GeneratedSamples) > 0 {
if uri := op.Response.GenerateVideoResponse.GeneratedSamples[0].Video.URI; uri != "" {
if len(op.Response.GenerateVideoResponse.GeneratedVideos) > 0 {
if uri := op.Response.GenerateVideoResponse.GeneratedVideos[0].Video.URI; uri != "" {
ti.RemoteUrl = uri
}
}
@@ -262,8 +242,6 @@ func (a *TaskAdaptor) ParseTaskResult(respBody []byte) (*relaycommon.TaskInfo, e
}
func (a *TaskAdaptor) ConvertToOpenAIVideo(task *model.Task) ([]byte, error) {
// Use GetUpstreamTaskID() to get the real upstream operation name for model extraction.
// task.TaskID is now a public task_xxxx ID, no longer a base64-encoded upstream name.
upstreamTaskID := task.GetUpstreamTaskID()
upstreamName, err := taskcommon.DecodeLocalTaskID(upstreamTaskID)
if err != nil {

View File

@@ -0,0 +1,138 @@
package gemini
import (
"strconv"
"strings"
)
// ParseVeoDurationSeconds extracts durationSeconds from metadata.
// Returns 8 (Veo default) when not specified, non-numeric, or non-positive.
func ParseVeoDurationSeconds(metadata map[string]any) int {
	const defaultSeconds = 8
	// Reading from a nil map is safe in Go and simply misses.
	raw, present := metadata["durationSeconds"]
	if !present {
		return defaultSeconds
	}
	// JSON-decoded metadata yields float64; accept a native int as well.
	var seconds int
	switch value := raw.(type) {
	case float64:
		seconds = int(value)
	case int:
		seconds = value
	}
	if seconds <= 0 {
		return defaultSeconds
	}
	return seconds
}
// ParseVeoResolution extracts resolution from metadata, lowercased.
// Returns "720p" when the key is absent, non-string, or empty.
func ParseVeoResolution(metadata map[string]any) string {
	const defaultResolution = "720p"
	// nil map reads are safe and report absence.
	raw, present := metadata["resolution"]
	if !present {
		return defaultResolution
	}
	str, isString := raw.(string)
	if !isString || str == "" {
		return defaultResolution
	}
	return strings.ToLower(str)
}
// ResolveVeoDuration returns the effective duration in seconds.
// Priority: metadata["durationSeconds"] > stdDuration > stdSeconds > default (8).
func ResolveVeoDuration(metadata map[string]any, stdDuration int, stdSeconds string) int {
	// An explicit metadata entry wins over the standard fields; the parse
	// helper handles type coercion and positivity checks.
	if _, present := metadata["durationSeconds"]; present {
		if parsed := ParseVeoDurationSeconds(metadata); parsed > 0 {
			return parsed
		}
	}
	if stdDuration > 0 {
		return stdDuration
	}
	// stdSeconds is a free-form string field; accept only positive integers.
	if parsed, err := strconv.Atoi(stdSeconds); err == nil && parsed > 0 {
		return parsed
	}
	return 8
}
// ResolveVeoResolution returns the effective resolution string (lowercase).
// Priority: metadata["resolution"] > SizeToVeoResolution(stdSize) > default ("720p").
func ResolveVeoResolution(metadata map[string]any, stdSize string) string {
	// Metadata takes precedence; ParseVeoResolution lowercases the value and
	// already falls back to "720p" for invalid entries.
	if _, present := metadata["resolution"]; present {
		if parsed := ParseVeoResolution(metadata); parsed != "" {
			return parsed
		}
	}
	if stdSize == "" {
		return "720p"
	}
	return SizeToVeoResolution(stdSize)
}
// SizeToVeoResolution converts a "WxH" size string to a Veo resolution label
// ("720p", "1080p", or "4k") based on the longest dimension. Unparsable input
// falls back to "720p".
func SizeToVeoResolution(size string) string {
	widthStr, heightStr, found := strings.Cut(strings.ToLower(size), "x")
	if !found {
		return "720p"
	}
	// Atoi errors are deliberately ignored: a bad component yields 0 and the
	// function degrades to the default label.
	width, _ := strconv.Atoi(widthStr)
	height, _ := strconv.Atoi(heightStr)
	longest := width
	if height > width {
		longest = height
	}
	switch {
	case longest >= 3840:
		return "4k"
	case longest >= 1920:
		return "1080p"
	default:
		return "720p"
	}
}
// SizeToVeoAspectRatio converts a "WxH" size string to a Veo aspect ratio.
// Portrait sizes (height > width) map to "9:16"; everything else, including
// unparsable or non-positive dimensions, maps to the landscape default "16:9".
func SizeToVeoAspectRatio(size string) string {
	const landscape = "16:9"
	widthStr, heightStr, found := strings.Cut(strings.ToLower(size), "x")
	if !found {
		return landscape
	}
	width, _ := strconv.Atoi(widthStr)
	height, _ := strconv.Atoi(heightStr)
	// height > width implies height > 0, so checking width > 0 suffices to
	// reject non-positive dimensions.
	if width > 0 && height > width {
		return "9:16"
	}
	return landscape
}
// VeoResolutionRatio returns the pricing multiplier for the given resolution.
// Standard resolutions (720p, 1080p) return 1.0; 4K returns a model-specific
// multiplier based on Google's official Vertex AI pricing.
//
// resolution is compared case-sensitively and must already be lowercase
// ("4k"); callers normalize via ResolveVeoResolution / strings.ToLower.
func VeoResolutionRatio(modelName, resolution string) float64 {
	if resolution != "4k" {
		return 1.0
	}
	// 4K multipliers derived from Vertex AI official pricing (video+audio base):
	//   veo-3.1-fast-generate: $0.35 / $0.15 ≈ 2.333
	//   veo-3.1-generate:      $0.60 / $0.40 = 1.5
	// Veo 3.0 models do not support 4K; return 1.0 as fallback.
	// Note: the fast check must run first, since fast model names also
	// contain "3.1". The original `Contains("3.1-generate") || Contains("3.1")`
	// was redundant — the "3.1" clause subsumes the other.
	switch {
	case strings.Contains(modelName, "3.1-fast-generate"):
		return 2.333333
	case strings.Contains(modelName, "3.1"):
		return 1.5
	default:
		return 1.0
	}
}

View File

@@ -0,0 +1,63 @@
package gemini
// GeminiVideoGenerationConfig represents the Gemini API GenerateVideosConfig.
// It is populated from request metadata (see BuildRequestBody) and serialized
// as the "config" member of GeminiVideoPayload.
// Reference: https://ai.google.dev/gemini-api/docs/video
type GeminiVideoGenerationConfig struct {
	AspectRatio      string `json:"aspectRatio,omitempty"`      // "16:9" or "9:16"
	DurationSeconds  int    `json:"durationSeconds,omitempty"`  // clip length in seconds, e.g. 4, 6, or 8
	NegativePrompt   string `json:"negativePrompt,omitempty"`   // elements to keep out of the generated video
	PersonGeneration string `json:"personGeneration,omitempty"` // "allow_all" for text-to-video, "allow_adult" for image-to-video
	Resolution       string `json:"resolution,omitempty"`       // lowercase label such as "720p", "1080p", "4k"
	NumberOfVideos   int    `json:"numberOfVideos,omitempty"`   // requested sample count; the adaptor pins this to 1
}
// VeoImageInput represents an image input for Veo image-to-video.
// Used by both Gemini and Vertex adaptors.
type VeoImageInput struct {
	BytesBase64Encoded string `json:"bytesBase64Encoded"` // image payload as standard base64 (no "data:" prefix)
	MimeType           string `json:"mimeType"`           // taken from the data URI / upload header, or sniffed from the bytes
}
// GeminiVideoPayload is the top-level request body for the Gemini API
// models/{model}:generateVideos endpoint.
type GeminiVideoPayload struct {
	Model  string                       `json:"model,omitempty"` // optional model name; the adaptor embeds the model in the request URL instead
	Prompt string                       `json:"prompt"`          // text prompt describing the desired video
	Image  *VeoImageInput               `json:"image,omitempty"` // optional image for image-to-video (from multipart input_reference or request images)
	Config *GeminiVideoGenerationConfig `json:"config,omitempty"` // generation parameters (duration, resolution, aspect ratio, ...)
	// TODO: support referenceImages (style/asset references, up to 3 images)
	// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
}
// submitResponse is the immediate reply to a generateVideos call. The
// returned operation name is encoded into the local task ID and later used
// by FetchTask to poll the operation status.
type submitResponse struct {
	Name string `json:"name"` // long-running operation resource name
}
// operationVideo describes one generated video returned inline in an
// operation response.
type operationVideo struct {
	MimeType           string `json:"mimeType"`           // MIME type reported by upstream
	BytesBase64Encoded string `json:"bytesBase64Encoded"` // inline video bytes, base64-encoded
	Encoding           string `json:"encoding"`           // upstream-reported encoding label
}
// operationResponse models the Gemini long-running operation returned when
// polling a video generation task. The Response payload deliberately covers
// several upstream shapes: a list of inline videos, a single inline video,
// and the generateVideoResponse form carrying downloadable URIs.
type operationResponse struct {
	Name     string `json:"name"` // operation resource name (doubles as the upstream task ID)
	Done     bool   `json:"done"` // true once the operation finished (success or failure)
	Response struct {
		Type                  string           `json:"@type"`                 // payload type discriminator set by upstream
		RaiMediaFilteredCount int              `json:"raiMediaFilteredCount"` // outputs removed by responsible-AI filtering (per upstream field name)
		Videos                []operationVideo `json:"videos"`                // inline results, one entry per generated video
		BytesBase64Encoded    string           `json:"bytesBase64Encoded"`    // single inline video payload (alternate response shape)
		Encoding              string           `json:"encoding"`              // encoding label for the single inline payload
		Video                 string           `json:"video"`                 // single video reference (alternate response shape)
		GenerateVideoResponse struct {
			GeneratedVideos []struct {
				Video struct {
					URI string `json:"uri"` // downloadable video URI; surfaced as TaskInfo.RemoteUrl
				} `json:"video"`
			} `json:"generatedVideos"`
		} `json:"generateVideoResponse"`
	} `json:"response"`
	Error struct {
		Message string `json:"message"` // upstream error message when the operation failed
	} `json:"error"`
}

View File

@@ -0,0 +1,100 @@
package gemini
import (
"encoding/base64"
"io"
"net/http"
"strings"
"github.com/QuantumNous/new-api/constant"
relaycommon "github.com/QuantumNous/new-api/relay/common"
"github.com/gin-gonic/gin"
)
const maxVeoImageSize = 20 * 1024 * 1024 // 20 MB
// ExtractMultipartImage reads the first `input_reference` file from a
// multipart form upload and returns a VeoImageInput. It is best-effort by
// design: any failure (no form, no file, oversize upload, open/read error)
// yields nil without reporting an error.
//
// Side effect: when an image is successfully extracted, info.Action is
// switched to constant.TaskActionGenerate (image-to-video).
func ExtractMultipartImage(c *gin.Context, info *relaycommon.RelayInfo) *VeoImageInput {
	form, err := c.MultipartForm()
	if err != nil {
		return nil
	}
	headers := form.File["input_reference"]
	if len(headers) == 0 {
		return nil
	}
	header := headers[0]
	// Reject oversized uploads before reading them into memory.
	if header.Size > maxVeoImageSize {
		return nil
	}
	src, err := header.Open()
	if err != nil {
		return nil
	}
	defer src.Close()
	data, err := io.ReadAll(src)
	if err != nil {
		return nil
	}
	// Prefer the client-declared content type, but sniff the real type when
	// the client sent nothing useful.
	contentType := header.Header.Get("Content-Type")
	if contentType == "" || contentType == "application/octet-stream" {
		contentType = http.DetectContentType(data)
	}
	info.Action = constant.TaskActionGenerate
	return &VeoImageInput{
		BytesBase64Encoded: base64.StdEncoding.EncodeToString(data),
		MimeType:           contentType,
	}
}
// ParseImageInput parses an image string (data URI or raw base64) into a
// VeoImageInput. Returns nil if the input is empty or invalid.
// TODO: support downloading HTTP URL images and converting to base64
func ParseImageInput(imageStr string) *VeoImageInput {
	trimmed := strings.TrimSpace(imageStr)
	switch {
	case trimmed == "":
		return nil
	case strings.HasPrefix(trimmed, "data:"):
		return parseDataURI(trimmed)
	}
	// Raw base64: decode both to validate the payload and to sniff the MIME
	// type from its leading bytes.
	decoded, err := base64.StdEncoding.DecodeString(trimmed)
	if err != nil {
		return nil
	}
	return &VeoImageInput{
		BytesBase64Encoded: trimmed,
		MimeType:           http.DetectContentType(decoded),
	}
}
// parseDataURI parses an RFC 2397 data URI ("data:image/png;base64,iVBOR...")
// into a VeoImageInput. The media type defaults to application/octet-stream
// when the URI omits it.
//
// Returns nil when the URI has no comma separator, the payload is empty, or
// the payload is not decodable standard base64 — the last check makes the
// contract consistent with ParseImageInput's raw-base64 path, which also
// rejects undecodable input instead of forwarding it upstream.
func parseDataURI(uri string) *VeoImageInput {
	// data:image/png;base64,iVBOR...
	rest := strings.TrimPrefix(uri, "data:")
	meta, b64, found := strings.Cut(rest, ",")
	if !found || b64 == "" {
		return nil
	}
	// Validate the payload; NOTE(review): this assumes the data URI always
	// declares base64 content, as the original code did — non-base64
	// (percent-encoded) data URIs are rejected here.
	if _, err := base64.StdEncoding.DecodeString(b64); err != nil {
		return nil
	}
	// meta looks like "image/png;base64"; the first ';'-separated token is
	// the media type.
	mimeType := "application/octet-stream"
	if mt, _, _ := strings.Cut(meta, ";"); mt != "" {
		mimeType = mt
	}
	return &VeoImageInput{
		BytesBase64Encoded: b64,
		MimeType:           mimeType,
	}
}

View File

@@ -16,6 +16,7 @@ import (
"github.com/QuantumNous/new-api/constant"
"github.com/QuantumNous/new-api/dto"
"github.com/QuantumNous/new-api/relay/channel"
geminitask "github.com/QuantumNous/new-api/relay/channel/task/gemini"
taskcommon "github.com/QuantumNous/new-api/relay/channel/task/taskcommon"
vertexcore "github.com/QuantumNous/new-api/relay/channel/vertex"
relaycommon "github.com/QuantumNous/new-api/relay/common"
@@ -26,9 +27,34 @@ import (
// Request / Response structures
// ============================
type veoInstance struct {
Prompt string `json:"prompt"`
Image *geminitask.VeoImageInput `json:"image,omitempty"`
// TODO: support referenceImages (style/asset references, up to 3 images)
// TODO: support lastFrame (first+last frame interpolation, Veo 3.1)
}
type veoParameters struct {
SampleCount int `json:"sampleCount"`
DurationSeconds int `json:"durationSeconds,omitempty"`
AspectRatio string `json:"aspectRatio,omitempty"`
Resolution string `json:"resolution,omitempty"`
NegativePrompt string `json:"negativePrompt,omitempty"`
PersonGeneration string `json:"personGeneration,omitempty"`
StorageUri string `json:"storageUri,omitempty"`
CompressionQuality string `json:"compressionQuality,omitempty"`
ResizeMode string `json:"resizeMode,omitempty"`
Seed *int `json:"seed,omitempty"`
GenerateAudio *bool `json:"generateAudio,omitempty"`
}
type requestPayload struct {
Instances []map[string]any `json:"instances"`
Parameters map[string]any `json:"parameters,omitempty"`
Instances []veoInstance `json:"instances"`
Parameters *veoParameters `json:"parameters,omitempty"`
}
type fetchOperationPayload struct {
OperationName string `json:"operationName"`
}
type submitResponse struct {
@@ -134,25 +160,21 @@ func (a *TaskAdaptor) BuildRequestHeader(c *gin.Context, req *http.Request, info
return nil
}
// EstimateBilling 根据用户请求中的 sampleCount 计算 OtherRatios。
func (a *TaskAdaptor) EstimateBilling(c *gin.Context, _ *relaycommon.RelayInfo) map[string]float64 {
sampleCount := 1
// EstimateBilling returns OtherRatios based on durationSeconds and resolution.
func (a *TaskAdaptor) EstimateBilling(c *gin.Context, info *relaycommon.RelayInfo) map[string]float64 {
v, ok := c.Get("task_request")
if ok {
req := v.(relaycommon.TaskSubmitReq)
if req.Metadata != nil {
if sc, exists := req.Metadata["sampleCount"]; exists {
if i, ok := sc.(int); ok && i > 0 {
sampleCount = i
}
if f, ok := sc.(float64); ok && int(f) > 0 {
sampleCount = int(f)
}
}
}
if !ok {
return nil
}
req := v.(relaycommon.TaskSubmitReq)
seconds := geminitask.ResolveVeoDuration(req.Metadata, req.Duration, req.Seconds)
resolution := geminitask.ResolveVeoResolution(req.Metadata, req.Size)
resRatio := geminitask.VeoResolutionRatio(info.UpstreamModelName, resolution)
return map[string]float64{
"sampleCount": float64(sampleCount),
"seconds": float64(seconds),
"resolution": resRatio,
}
}
@@ -164,29 +186,35 @@ func (a *TaskAdaptor) BuildRequestBody(c *gin.Context, info *relaycommon.RelayIn
}
req := v.(relaycommon.TaskSubmitReq)
body := requestPayload{
Instances: []map[string]any{{"prompt": req.Prompt}},
Parameters: map[string]any{},
}
if req.Metadata != nil {
if v, ok := req.Metadata["storageUri"]; ok {
body.Parameters["storageUri"] = v
instance := veoInstance{Prompt: req.Prompt}
if img := geminitask.ExtractMultipartImage(c, info); img != nil {
instance.Image = img
} else if len(req.Images) > 0 {
if parsed := geminitask.ParseImageInput(req.Images[0]); parsed != nil {
instance.Image = parsed
info.Action = constant.TaskActionGenerate
}
if v, ok := req.Metadata["sampleCount"]; ok {
if i, ok := v.(int); ok {
body.Parameters["sampleCount"] = i
}
if f, ok := v.(float64); ok {
body.Parameters["sampleCount"] = int(f)
}
}
}
if _, ok := body.Parameters["sampleCount"]; !ok {
body.Parameters["sampleCount"] = 1
}
if body.Parameters["sampleCount"].(int) <= 0 {
return nil, fmt.Errorf("sampleCount must be greater than 0")
params := &veoParameters{}
if err := taskcommon.UnmarshalMetadata(req.Metadata, params); err != nil {
return nil, fmt.Errorf("unmarshal metadata failed: %w", err)
}
if params.DurationSeconds == 0 && req.Duration > 0 {
params.DurationSeconds = req.Duration
}
if params.Resolution == "" && req.Size != "" {
params.Resolution = geminitask.SizeToVeoResolution(req.Size)
}
if params.AspectRatio == "" && req.Size != "" {
params.AspectRatio = geminitask.SizeToVeoAspectRatio(req.Size)
}
params.Resolution = strings.ToLower(params.Resolution)
params.SampleCount = 1
body := requestPayload{
Instances: []veoInstance{instance},
Parameters: params,
}
data, err := common.Marshal(body)
@@ -226,7 +254,14 @@ func (a *TaskAdaptor) DoResponse(c *gin.Context, resp *http.Response, info *rela
return localID, responseBody, nil
}
func (a *TaskAdaptor) GetModelList() []string { return []string{"veo-3.0-generate-001"} }
func (a *TaskAdaptor) GetModelList() []string {
return []string{
"veo-3.0-generate-001",
"veo-3.0-fast-generate-001",
"veo-3.1-generate-preview",
"veo-3.1-fast-generate-preview",
}
}
func (a *TaskAdaptor) GetChannelName() string { return "vertex" }
// FetchTask fetch task status
@@ -254,7 +289,7 @@ func (a *TaskAdaptor) FetchTask(baseUrl, key string, body map[string]any, proxy
} else {
url = fmt.Sprintf("https://%s-aiplatform.googleapis.com/v1/projects/%s/locations/%s/publishers/google/models/%s:fetchPredictOperation", region, project, region, modelName)
}
payload := map[string]string{"operationName": upstreamName}
payload := fetchOperationPayload{OperationName: upstreamName}
data, err := common.Marshal(payload)
if err != nil {
return nil, err