feat: gpt->claude格式转换支持图片识别

This commit is contained in:
shaw
2026-03-08 23:16:58 +08:00
parent a2ae9f1f27
commit 00c151b463
3 changed files with 276 additions and 24 deletions

View File

@@ -733,3 +733,188 @@ func TestAnthropicToResponses_ToolChoiceSpecific(t *testing.T) {
require.True(t, ok)
assert.Equal(t, "get_weather", fn["name"])
}
// ---------------------------------------------------------------------------
// Image content block conversion tests
// ---------------------------------------------------------------------------
func TestAnthropicToResponses_UserImageBlock(t *testing.T) {
req := &AnthropicRequest{
Model: "gpt-5.2",
MaxTokens: 1024,
Messages: []AnthropicMessage{
{Role: "user", Content: json.RawMessage(`[
{"type":"text","text":"What is in this image?"},
{"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}}
]`)},
},
}
resp, err := AnthropicToResponses(req)
require.NoError(t, err)
var items []ResponsesInputItem
require.NoError(t, json.Unmarshal(resp.Input, &items))
require.Len(t, items, 1)
assert.Equal(t, "user", items[0].Role)
var parts []ResponsesContentPart
require.NoError(t, json.Unmarshal(items[0].Content, &parts))
require.Len(t, parts, 2)
assert.Equal(t, "input_text", parts[0].Type)
assert.Equal(t, "What is in this image?", parts[0].Text)
assert.Equal(t, "input_image", parts[1].Type)
assert.Equal(t, "data:image/png;base64,iVBOR", parts[1].ImageURL)
}
func TestAnthropicToResponses_ImageOnlyUserMessage(t *testing.T) {
req := &AnthropicRequest{
Model: "gpt-5.2",
MaxTokens: 1024,
Messages: []AnthropicMessage{
{Role: "user", Content: json.RawMessage(`[
{"type":"image","source":{"type":"base64","media_type":"image/jpeg","data":"/9j/4AAQ"}}
]`)},
},
}
resp, err := AnthropicToResponses(req)
require.NoError(t, err)
var items []ResponsesInputItem
require.NoError(t, json.Unmarshal(resp.Input, &items))
require.Len(t, items, 1)
var parts []ResponsesContentPart
require.NoError(t, json.Unmarshal(items[0].Content, &parts))
require.Len(t, parts, 1)
assert.Equal(t, "input_image", parts[0].Type)
assert.Equal(t, "data:image/jpeg;base64,/9j/4AAQ", parts[0].ImageURL)
}
func TestAnthropicToResponses_ToolResultWithImage(t *testing.T) {
req := &AnthropicRequest{
Model: "gpt-5.2",
MaxTokens: 1024,
Messages: []AnthropicMessage{
{Role: "user", Content: json.RawMessage(`"Read the screenshot"`)},
{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_1","name":"Read","input":{"file_path":"/tmp/screen.png"}}]`)},
{Role: "user", Content: json.RawMessage(`[
{"type":"tool_result","tool_use_id":"toolu_1","content":[
{"type":"image","source":{"type":"base64","media_type":"image/png","data":"iVBOR"}}
]}
]`)},
},
}
resp, err := AnthropicToResponses(req)
require.NoError(t, err)
var items []ResponsesInputItem
require.NoError(t, json.Unmarshal(resp.Input, &items))
// user + function_call + function_call_output + user(image) = 4
require.Len(t, items, 4)
// function_call_output should have text-only output (no image).
assert.Equal(t, "function_call_output", items[2].Type)
assert.Equal(t, "fc_toolu_1", items[2].CallID)
assert.Equal(t, "(empty)", items[2].Output)
// Image should be in a separate user message.
assert.Equal(t, "user", items[3].Role)
var parts []ResponsesContentPart
require.NoError(t, json.Unmarshal(items[3].Content, &parts))
require.Len(t, parts, 1)
assert.Equal(t, "input_image", parts[0].Type)
assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL)
}
func TestAnthropicToResponses_ToolResultMixed(t *testing.T) {
req := &AnthropicRequest{
Model: "gpt-5.2",
MaxTokens: 1024,
Messages: []AnthropicMessage{
{Role: "user", Content: json.RawMessage(`"Describe the file"`)},
{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"toolu_2","name":"Read","input":{"file_path":"/tmp/photo.png"}}]`)},
{Role: "user", Content: json.RawMessage(`[
{"type":"tool_result","tool_use_id":"toolu_2","content":[
{"type":"text","text":"File metadata: 800x600 PNG"},
{"type":"image","source":{"type":"base64","media_type":"image/png","data":"AAAA"}}
]}
]`)},
},
}
resp, err := AnthropicToResponses(req)
require.NoError(t, err)
var items []ResponsesInputItem
require.NoError(t, json.Unmarshal(resp.Input, &items))
// user + function_call + function_call_output + user(image) = 4
require.Len(t, items, 4)
// function_call_output should have text-only output.
assert.Equal(t, "function_call_output", items[2].Type)
assert.Equal(t, "File metadata: 800x600 PNG", items[2].Output)
// Image should be in a separate user message.
assert.Equal(t, "user", items[3].Role)
var parts []ResponsesContentPart
require.NoError(t, json.Unmarshal(items[3].Content, &parts))
require.Len(t, parts, 1)
assert.Equal(t, "input_image", parts[0].Type)
assert.Equal(t, "data:image/png;base64,AAAA", parts[0].ImageURL)
}
func TestAnthropicToResponses_TextOnlyToolResultBackwardCompat(t *testing.T) {
req := &AnthropicRequest{
Model: "gpt-5.2",
MaxTokens: 1024,
Messages: []AnthropicMessage{
{Role: "user", Content: json.RawMessage(`"Check weather"`)},
{Role: "assistant", Content: json.RawMessage(`[{"type":"tool_use","id":"call_1","name":"get_weather","input":{"city":"NYC"}}]`)},
{Role: "user", Content: json.RawMessage(`[
{"type":"tool_result","tool_use_id":"call_1","content":[
{"type":"text","text":"Sunny, 72°F"}
]}
]`)},
},
}
resp, err := AnthropicToResponses(req)
require.NoError(t, err)
var items []ResponsesInputItem
require.NoError(t, json.Unmarshal(resp.Input, &items))
// user + function_call + function_call_output = 3
require.Len(t, items, 3)
// Text-only tool_result should produce a plain string.
assert.Equal(t, "Sunny, 72°F", items[2].Output)
}
func TestAnthropicToResponses_ImageEmptyMediaType(t *testing.T) {
req := &AnthropicRequest{
Model: "gpt-5.2",
MaxTokens: 1024,
Messages: []AnthropicMessage{
{Role: "user", Content: json.RawMessage(`[
{"type":"image","source":{"type":"base64","media_type":"","data":"iVBOR"}}
]`)},
},
}
resp, err := AnthropicToResponses(req)
require.NoError(t, err)
var items []ResponsesInputItem
require.NoError(t, json.Unmarshal(resp.Input, &items))
require.Len(t, items, 1)
var parts []ResponsesContentPart
require.NoError(t, json.Unmarshal(items[0].Content, &parts))
require.Len(t, parts, 1)
assert.Equal(t, "input_image", parts[0].Type)
// Should default to image/png when media_type is empty.
assert.Equal(t, "data:image/png;base64,iVBOR", parts[0].ImageURL)
}

View File

@@ -169,7 +169,7 @@ func anthropicMsgToResponsesItems(m AnthropicMessage) ([]ResponsesInputItem, err
// anthropicUserToResponses handles an Anthropic user message. Content can be a
// plain string or an array of blocks. tool_result blocks are extracted into
// function_call_output items. Image blocks are converted to input_image parts.
func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error) {
// Try plain string.
var s string
@@ -184,28 +184,46 @@ func anthropicUserToResponses(raw json.RawMessage) ([]ResponsesInputItem, error)
}
var out []ResponsesInputItem
var toolResultImageParts []ResponsesContentPart
// Extract tool_result blocks → function_call_output items.
// Images inside tool_results are extracted separately because the
// Responses API function_call_output.output only accepts strings.
for _, b := range blocks {
if b.Type != "tool_result" {
continue
}
text := extractAnthropicToolResultText(b)
if text == "" {
// OpenAI Responses API requires "output" field; use placeholder for empty results.
text = "(empty)"
}
outputText, imageParts := convertToolResultOutput(b)
out = append(out, ResponsesInputItem{
Type: "function_call_output",
CallID: toResponsesCallID(b.ToolUseID),
Output: text,
Output: outputText,
})
toolResultImageParts = append(toolResultImageParts, imageParts...)
}
// Remaining text blocks → user message.
text := extractAnthropicTextFromBlocks(blocks)
if text != "" {
content, _ := json.Marshal(text)
// Remaining text + image blocks → user message with content parts.
// Also include images extracted from tool_results so the model can see them.
var parts []ResponsesContentPart
for _, b := range blocks {
switch b.Type {
case "text":
if b.Text != "" {
parts = append(parts, ResponsesContentPart{Type: "input_text", Text: b.Text})
}
case "image":
if uri := anthropicImageToDataURI(b.Source); uri != "" {
parts = append(parts, ResponsesContentPart{Type: "input_image", ImageURL: uri})
}
}
}
parts = append(parts, toolResultImageParts...)
if len(parts) > 0 {
content, err := json.Marshal(parts)
if err != nil {
return nil, err
}
out = append(out, ResponsesInputItem{Role: "user", Content: content})
}
@@ -290,26 +308,64 @@ func fromResponsesCallID(id string) string {
return id
}
// extractAnthropicToolResultText gets the text content from a tool_result block.
func extractAnthropicToolResultText(b AnthropicContentBlock) string {
if len(b.Content) == 0 {
// anthropicImageToDataURI renders an Anthropic image source as a
// "data:<media type>;base64,<data>" URI.
//
// It returns "" when src is nil or its Data field is empty. When MediaType is
// empty, "image/png" is used in its place.
func anthropicImageToDataURI(src *AnthropicImageSource) string {
	if src == nil {
		return ""
	}

	payload, mime := src.Data, src.MediaType
	if payload == "" {
		return ""
	}
	if mime == "" {
		// No declared media type: fall back to PNG.
		mime = "image/png"
	}
	return "data:" + mime + ";base64," + payload
}
// convertToolResultOutput extracts text and image content from a tool_result
// block. Returns the text as a string for the function_call_output Output
// field, plus any image parts that must be sent in a separate user message
// (the Responses API output field only accepts strings).
func convertToolResultOutput(b AnthropicContentBlock) (string, []ResponsesContentPart) {
if len(b.Content) == 0 {
return "(empty)", nil
}
// Try plain string content.
var s string
if err := json.Unmarshal(b.Content, &s); err == nil {
return s
if s == "" {
s = "(empty)"
}
return s, nil
}
// Array of content blocks — may contain text and/or images.
var inner []AnthropicContentBlock
if err := json.Unmarshal(b.Content, &inner); err == nil {
var parts []string
for _, ib := range inner {
if ib.Type == "text" && ib.Text != "" {
parts = append(parts, ib.Text)
if err := json.Unmarshal(b.Content, &inner); err != nil {
return "(empty)", nil
}
// Separate text (for function_call_output) from images (for user message).
var textParts []string
var imageParts []ResponsesContentPart
for _, ib := range inner {
switch ib.Type {
case "text":
if ib.Text != "" {
textParts = append(textParts, ib.Text)
}
case "image":
if uri := anthropicImageToDataURI(ib.Source); uri != "" {
imageParts = append(imageParts, ResponsesContentPart{Type: "input_image", ImageURL: uri})
}
}
return strings.Join(parts, "\n\n")
}
return ""
text := strings.Join(textParts, "\n\n")
if text == "" {
text = "(empty)"
}
return text, imageParts
}
// extractAnthropicTextFromBlocks joins all text blocks, ignoring thinking/

View File

@@ -47,6 +47,9 @@ type AnthropicContentBlock struct {
// type=thinking
Thinking string `json:"thinking,omitempty"`
// type=image
Source *AnthropicImageSource `json:"source,omitempty"`
// type=tool_use
ID string `json:"id,omitempty"`
Name string `json:"name,omitempty"`
@@ -58,6 +61,13 @@ type AnthropicContentBlock struct {
IsError bool `json:"is_error,omitempty"`
}
// AnthropicImageSource is the "source" object of an image content block: it
// names how the image is encoded, its media type, and the encoded payload.
type AnthropicImageSource struct {
	// Type is the source kind, e.g. "base64".
	Type string `json:"type"`
	// MediaType is the declared media type of the image, e.g. "image/png".
	MediaType string `json:"media_type"`
	// Data is the encoded image payload.
	Data string `json:"data"`
}
// AnthropicTool describes a tool available to the model.
type AnthropicTool struct {
Type string `json:"type,omitempty"` // e.g. "web_search_20250305" for server tools
@@ -176,8 +186,9 @@ type ResponsesInputItem struct {
// ResponsesContentPart is a typed content part in a Responses message.
type ResponsesContentPart struct {
	// Type discriminates the part: "input_text", "output_text" or "input_image".
	Type string `json:"type"`
	// Text is the text payload; it is omitted from the JSON when empty.
	Text string `json:"text,omitempty"`
	// ImageURL is the data URI for input_image parts; it is omitted from the
	// JSON when empty.
	ImageURL string `json:"image_url,omitempty"`
}
// ResponsesTool describes a tool in the Responses API.