feat: native image injection for vision-capable models

- Auto-detect and load images referenced in user prompts
- Inject history images at their original message positions
- Fix EXIF orientation - rotate before resizing in resizeToJpeg
- Sandbox security: validate paths, block remote URLs when sandbox enabled
- Prevent duplicate history image injection across turns
- Handle string-based user message content (convert to array)
- Add bounds check for message index in history processing
- Fix regex to properly match relative paths (./ ../)
- Add multi-image support for iMessage attachments
- Pass MAX_IMAGE_BYTES limit to image loading

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-09 11:57:39 +00:00 · 2026-01-17 03:10:10 -08:00
parent f7123ec30a
commit 8d74578ceb
9 changed files with 892 additions and 16 deletions
--- a/src/agents/pi-embedded-runner/run/attempt.ts
+++ b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -2,7 +2,7 @@ import fs from "node:fs/promises";
 import os from "node:os";

 import type { AgentMessage } from "@mariozechner/pi-agent-core";
-import type { AssistantMessage } from "@mariozechner/pi-ai";
+import type { AssistantMessage, ImageContent } from "@mariozechner/pi-ai";
 import { streamSimple } from "@mariozechner/pi-ai";
 import { createAgentSession, SessionManager, SettingsManager } from "@mariozechner/pi-coding-agent";

@@ -69,7 +69,9 @@ import { resolveSandboxRuntimeStatus } from "../../sandbox/runtime-status.js";
 import { isTimeoutError } from "../../failover-error.js";
 import { getGlobalHookRunner } from "../../../plugins/hook-runner-global.js";

+import { MAX_IMAGE_BYTES } from "../../../media/constants.js";
 import type { EmbeddedRunAttemptParams, EmbeddedRunAttemptResult } from "./types.js";
+import { detectAndLoadPromptImages } from "./images.js";

 export async function runEmbeddedAttempt(
  params: EmbeddedRunAttemptParams,
@@ -133,6 +135,9 @@ export async function runEmbeddedAttempt(

    const agentDir = params.agentDir ?? resolveClawdbotAgentDir();

+    // Check if the model supports native image input
+    const modelHasVision = params.model.input?.includes("image") ?? false;
+
    const toolsRaw = createClawdbotCodingTools({
      exec: {
        ...params.execOverrides,
@@ -153,6 +158,7 @@ export async function runEmbeddedAttempt(
      currentThreadTs: params.currentThreadTs,
      replyToMode: params.replyToMode,
      hasRepliedRef: params.hasRepliedRef,
+      modelHasVision,
    });
    const tools = sanitizeToolsForGoogle({ tools: toolsRaw, provider: params.provider });
    logToolSchemasForGoogle({ tools, provider: params.provider });
@@ -530,7 +536,60 @@ export async function runEmbeddedAttempt(
        }

        try {
-          await abortable(activeSession.prompt(effectivePrompt, { images: params.images }));
+          // Detect and load images referenced in the prompt for vision-capable models.
+          // This eliminates the need for an explicit "view" tool call by injecting
+          // images directly into the prompt when the model supports it.
+          // Also scans conversation history to enable follow-up questions about earlier images.
+          const imageResult = await detectAndLoadPromptImages({
+            prompt: effectivePrompt,
+            workspaceDir: effectiveWorkspace,
+            model: params.model,
+            existingImages: params.images,
+            historyMessages: activeSession.messages,
+            maxBytes: MAX_IMAGE_BYTES,
+            // Enforce sandbox path restrictions when sandbox is enabled
+            sandboxRoot: sandbox?.enabled ? sandbox.workspaceDir : undefined,
+          });
+
+          // Inject history images into their original message positions.
+          // This ensures the model sees images in context (e.g., "compare to the first image").
+          if (imageResult.historyImagesByIndex.size > 0) {
+            for (const [msgIndex, images] of imageResult.historyImagesByIndex) {
+              // Bounds check: ensure index is valid before accessing
+              if (msgIndex < 0 || msgIndex >= activeSession.messages.length) continue;
+              const msg = activeSession.messages[msgIndex];
+              if (msg && msg.role === "user") {
+                // Convert string content to array format if needed
+                if (typeof msg.content === "string") {
+                  msg.content = [{ type: "text", text: msg.content }];
+                }
+                if (Array.isArray(msg.content)) {
+                  // Check for existing image content to avoid duplicates across turns
+                  const existingImageData = new Set(
+                    msg.content
+                      .filter((c): c is ImageContent =>
+                        c != null && typeof c === "object" && c.type === "image" && typeof c.data === "string",
+                      )
+                      .map((c) => c.data),
+                  );
+                  for (const img of images) {
+                    // Only add if this image isn't already in the message
+                    if (!existingImageData.has(img.data)) {
+                      msg.content.push(img);
+                    }
+                  }
+                }
+              }
+            }
+          }
+
+          // Only pass images option if there are actually images to pass
+          // This avoids potential issues with models that don't expect the images parameter
+          if (imageResult.images.length > 0) {
+            await abortable(activeSession.prompt(effectivePrompt, { images: imageResult.images }));
+          } else {
+            await abortable(activeSession.prompt(effectivePrompt));
+          }
        } catch (err) {
          promptError = err;
        } finally {