feat: native image injection for vision-capable models

- Auto-detect and load images referenced in user prompts
- Inject history images at their original message positions
- Fix EXIF orientation - rotate before resizing in resizeToJpeg
- Sandbox security: validate paths, block remote URLs when sandbox enabled
- Prevent duplicate history image injection across turns
- Handle string-based user message content (convert to array)
- Add bounds check for message index in history processing
- Fix regex to properly match relative paths (./  ../)
- Add multi-image support for iMessage attachments
- Pass MAX_IMAGE_BYTES limit to image loading

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Tyler Yust
2026-01-17 03:10:10 -08:00
committed by Peter Steinberger
parent f7123ec30a
commit 8d74578ceb
9 changed files with 892 additions and 16 deletions

View File

@@ -145,11 +145,10 @@ export function resolveImageModelConfigForTool(params: {
cfg?: ClawdbotConfig;
agentDir: string;
}): ImageModelConfig | null {
const primarySupportsImages = resolvePrimaryModelSupportsImages({
cfg: params.cfg,
agentDir: params.agentDir,
});
if (primarySupportsImages === true) return null;
// Note: We intentionally do NOT gate based on primarySupportsImages here.
// Even when the primary model supports images, we keep the tool available
// because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages).
// The tool description is adjusted via modelHasVision to discourage redundant usage.
const explicit = coerceImageModelConfig(params.cfg);
if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
return explicit;
@@ -368,6 +367,8 @@ export function createImageTool(options?: {
config?: ClawdbotConfig;
agentDir?: string;
sandboxRoot?: string;
/** If true, the model has native vision capability and images in the prompt are auto-injected */
modelHasVision?: boolean;
}): AnyAgentTool | null {
const agentDir = options?.agentDir?.trim();
if (!agentDir) {
@@ -382,11 +383,17 @@ export function createImageTool(options?: {
agentDir,
});
if (!imageModelConfig) return null;
// If model has native vision, images in the prompt are auto-injected
// so this tool is only needed when image wasn't provided in the prompt
const description = options?.modelHasVision
? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
: "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";
return {
label: "Image",
name: "image",
description:
"Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.",
description,
parameters: Type.Object({
prompt: Type.Optional(Type.String()),
image: Type.String(),