feat: native image injection for vision-capable models

- Auto-detect and load images referenced in user prompts - Inject history images at their original message positions - Fix EXIF orientation - rotate before resizing in resizeToJpeg - Sandbox security: validate paths, block remote URLs when sandbox enabled - Prevent duplicate history image injection across turns - Handle string-based user message content (convert to array) - Add bounds check for message index in history processing - Fix regex to properly match relative paths (./ ../) - Add multi-image support for iMessage attachments - Pass MAX_IMAGE_BYTES limit to image loading Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-09 09:47:40 +00:00 · 2026-01-17 03:10:10 -08:00
parent f7123ec30a
commit 8d74578ceb
9 changed files with 892 additions and 16 deletions
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -145,11 +145,10 @@ export function resolveImageModelConfigForTool(params: {
  cfg?: ClawdbotConfig;
  agentDir: string;
 }): ImageModelConfig | null {
-  const primarySupportsImages = resolvePrimaryModelSupportsImages({
-    cfg: params.cfg,
-    agentDir: params.agentDir,
-  });
-  if (primarySupportsImages === true) return null;
+  // Note: We intentionally do NOT gate based on primarySupportsImages here.
+  // Even when the primary model supports images, we keep the tool available
+  // because images are auto-injected into prompts (see attempt.ts detectAndLoadPromptImages).
+  // The tool description is adjusted via modelHasVision to discourage redundant usage.
  const explicit = coerceImageModelConfig(params.cfg);
  if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
    return explicit;
@@ -368,6 +367,8 @@ export function createImageTool(options?: {
  config?: ClawdbotConfig;
  agentDir?: string;
  sandboxRoot?: string;
+  /** If true, the model has native vision capability and images in the prompt are auto-injected */
+  modelHasVision?: boolean;
 }): AnyAgentTool | null {
  const agentDir = options?.agentDir?.trim();
  if (!agentDir) {
@@ -382,11 +383,17 @@ export function createImageTool(options?: {
    agentDir,
  });
  if (!imageModelConfig) return null;
+
+  // If model has native vision, images in the prompt are auto-injected
+  // so this tool is only needed when image wasn't provided in the prompt
+  const description = options?.modelHasVision
+    ? "Analyze an image with a vision model. Only use this tool when the image was NOT already provided in the user's message. Images mentioned in the prompt are automatically visible to you."
+    : "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.";
+
  return {
    label: "Image",
    name: "image",
-    description:
-      "Analyze an image with the configured image model (agents.defaults.imageModel). Provide a prompt and image path or URL.",
+    description,
    parameters: Type.Object({
      prompt: Type.Optional(Type.String()),
      image: Type.String(),