fix: skip image understanding for vision models (#1747)

Thanks @tyler6204. Co-authored-by: Tyler Yust <64381258+tyler6204@users.noreply.github.com>
2026-05-10 08:32:43 +00:00 · 2026-01-25 09:56:57 +00:00
parent fdecf5c59a
commit 5f9863098b
3 changed files with 98 additions and 20 deletions
--- a/src/media-understanding/runner.vision-skip.test.ts
+++ b/src/media-understanding/runner.vision-skip.test.ts
@@ -0,0 +1,61 @@
+import { describe, expect, it, vi } from "vitest";
+
+import type { MsgContext } from "../auto-reply/templating.js";
+import type { ClawdbotConfig } from "../config/config.js";
+import {
+  buildProviderRegistry,
+  createMediaAttachmentCache,
+  normalizeMediaAttachments,
+  runCapability,
+} from "./runner.js";
+
+const catalog = [ // Stub model catalog; the mocked loadModelCatalog in this file resolves to it.
+  {
+    id: "gpt-4.1",
+    name: "GPT-4.1",
+    provider: "openai",
+    input: ["text", "image"] as const, // Lists "image" as an input; presumably what the runner checks for vision support — confirm in runner.ts.
+  },
+];
+
+vi.mock("../agents/model-catalog.js", async () => { // Partial mock: only loadModelCatalog is overridden.
+  const actual = await vi.importActual<typeof import("../agents/model-catalog.js")>(
+    "../agents/model-catalog.js",
+  );
+  return {
+    ...actual, // Keep every other export of the real module.
+    loadModelCatalog: vi.fn(async () => catalog), // Always resolves to the stub `catalog` defined in this file.
+  };
+});
+
+describe("runCapability image skip", () => { // Exercises runCapability's skip path for the "image" capability.
+  it("skips image understanding when the active model supports vision", async () => {
+    const ctx: MsgContext = { MediaPath: "/tmp/image.png", MediaType: "image/png" }; // One PNG attachment, described by path and MIME type.
+    const media = normalizeMediaAttachments(ctx);
+    const cache = createMediaAttachmentCache(media);
+    const cfg = {} as ClawdbotConfig; // Empty config object; no fields are set.
+
+    try { // try/finally so cache.cleanup() runs even when an assertion throws.
+      const result = await runCapability({
+        capability: "image",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry: buildProviderRegistry(),
+        activeModel: { provider: "openai", model: "gpt-4.1" }, // Same provider/id as the stub catalog entry.
+      });
+
+      expect(result.outputs).toHaveLength(0); // No understanding output is expected.
+      expect(result.decision.outcome).toBe("skipped");
+      expect(result.decision.attachments).toHaveLength(1); // The single attachment still gets a decision entry.
+      expect(result.decision.attachments[0]?.attachmentIndex).toBe(0);
+      expect(result.decision.attachments[0]?.attempts[0]?.outcome).toBe("skipped");
+      expect(result.decision.attachments[0]?.attempts[0]?.reason).toBe(
+        "primary model supports vision natively",
+      );
+    } finally {
+      await cache.cleanup();
+    }
+  });
+});