fix(auto-reply): allow image-only messages to reach the agent (openclaw#12352) thanks @arosstale

Verified:
- pnpm build
- pnpm check
- pnpm test

Co-authored-by: arosstale <117890364+arosstale@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
2026-05-06 17:01:37 +00:00 · 2026-02-14 01:42:22 +01:00
parent e7c3c27fd0
commit 67b5c093b5
3 changed files with 204 additions and 3 deletions
--- a/src/auto-reply/reply/get-reply-run.media-only.test.ts
+++ b/src/auto-reply/reply/get-reply-run.media-only.test.ts
@@ -0,0 +1,192 @@
+import { beforeEach, describe, expect, it, vi } from "vitest";
+import { runPreparedReply } from "./get-reply-run.js";
+
+vi.mock("../../agents/auth-profiles/session-override.js", () => ({ // Vitest hoists vi.mock calls, so every stub below is registered before get-reply-run.js is loaded
+  resolveSessionAuthProfileOverride: vi.fn().mockResolvedValue(undefined),
+}));
+
+vi.mock("../../agents/pi-embedded.js", () => ({
+  abortEmbeddedPiRun: vi.fn().mockReturnValue(false),
+  isEmbeddedPiRunActive: vi.fn().mockReturnValue(false), // stubbed: reports no active embedded run
+  isEmbeddedPiRunStreaming: vi.fn().mockReturnValue(false), // stubbed: reports no streaming embedded run
+  resolveEmbeddedSessionLane: vi.fn().mockReturnValue("session:session-key"),
+}));
+
+vi.mock("../../config/sessions.js", () => ({
+  resolveGroupSessionKey: vi.fn().mockReturnValue(undefined),
+  resolveSessionFilePath: vi.fn().mockReturnValue("/tmp/session.jsonl"),
+  resolveSessionFilePathOptions: vi.fn().mockReturnValue({}),
+  updateSessionStore: vi.fn(),
+}));
+
+vi.mock("../../globals.js", () => ({
+  logVerbose: vi.fn(), // silences verbose logging during the tests
+}));
+
+vi.mock("../../process/command-queue.js", () => ({
+  clearCommandLane: vi.fn().mockReturnValue(0),
+  getQueueSize: vi.fn().mockReturnValue(0), // stubbed: queue size is always 0
+}));
+
+vi.mock("../../routing/session-key.js", () => ({
+  normalizeMainKey: vi.fn().mockReturnValue("main"),
+}));
+
+vi.mock("../../utils/provider-utils.js", () => ({
+  isReasoningTagProvider: vi.fn().mockReturnValue(false),
+}));
+
+vi.mock("../command-detection.js", () => ({
+  hasControlCommand: vi.fn().mockReturnValue(false), // stubbed: input is never reported as a control command
+}));
+
+vi.mock("./agent-runner.js", () => ({
+  runReplyAgent: vi.fn().mockResolvedValue({ text: "ok" }), // canned agent result; the media-only test expects runPreparedReply to return exactly this
+}));
+
+vi.mock("./body.js", () => ({
+  applySessionHints: vi.fn().mockImplementation(async ({ baseBody }) => baseBody), // pass-through: returns baseBody unchanged
+}));
+
+vi.mock("./groups.js", () => ({
+  buildGroupIntro: vi.fn().mockReturnValue(""),
+}));
+
+vi.mock("./inbound-meta.js", () => ({
+  buildInboundMetaSystemPrompt: vi.fn().mockReturnValue(""),
+  buildInboundUserContextPrefix: vi.fn().mockReturnValue(""), // empty prefix, so it contributes no text to the prompt body
+}));
+
+vi.mock("./queue.js", () => ({
+  resolveQueueSettings: vi.fn().mockReturnValue({ mode: "followup" }),
+}));
+
+vi.mock("./route-reply.js", () => ({
+  routeReply: vi.fn(),
+}));
+
+vi.mock("./session-updates.js", () => ({
+  ensureSkillSnapshot: vi.fn().mockImplementation(async ({ sessionEntry, systemSent }) => ({ // echoes its inputs back with no skills snapshot
+    sessionEntry,
+    systemSent,
+    skillsSnapshot: undefined,
+  })),
+  prependSystemEvents: vi.fn().mockImplementation(async ({ prefixedBodyBase }) => prefixedBodyBase), // pass-through: returns prefixedBodyBase unchanged
+}));
+
+vi.mock("./typing-mode.js", () => ({
+  resolveTypingMode: vi.fn().mockReturnValue("off"),
+}));
+
+import { runReplyAgent } from "./agent-runner.js";
+
+function baseParams( // Builds a complete runPreparedReply argument object for a media-only message (empty text, one media path)
+  overrides: Partial<Parameters<typeof runPreparedReply>[0]> = {}, // top-level fields to replace in the default fixture
+): Parameters<typeof runPreparedReply>[0] {
+  return {
+    ctx: {
+      Body: "", // all text bodies are empty: the default fixture carries no caption
+      RawBody: "",
+      CommandBody: "",
+      ThreadHistoryBody: "Earlier message in this thread", // the first test expects this text in the queued followup prompt
+      OriginatingChannel: "slack",
+      OriginatingTo: "C123",
+      ChatType: "group",
+    },
+    sessionCtx: {
+      Body: "",
+      BodyStripped: "",
+      ThreadHistoryBody: "Earlier message in this thread",
+      MediaPath: "/tmp/input.png", // the only media marker in the default fixture
+      Provider: "slack",
+      ChatType: "group",
+      OriginatingChannel: "slack",
+      OriginatingTo: "C123",
+    },
+    cfg: { session: {}, channels: {}, agents: { defaults: {} } },
+    agentId: "default",
+    agentDir: "/tmp/agent",
+    agentCfg: {},
+    sessionCfg: {},
+    commandAuthorized: true,
+    command: {
+      isAuthorizedSender: true,
+      abortKey: "session-key",
+      ownerList: [],
+      senderIsOwner: false,
+    } as never, // cast so this hand-written stub is accepted without matching the declared parameter type
+    commandSource: "",
+    allowTextCommands: true,
+    directives: {
+      hasThinkDirective: false,
+      thinkLevel: undefined,
+    } as never, // same stub cast as above
+    defaultActivation: "always",
+    resolvedThinkLevel: "high",
+    resolvedVerboseLevel: "off",
+    resolvedReasoningLevel: "off",
+    resolvedElevatedLevel: "off",
+    elevatedEnabled: false,
+    elevatedAllowed: false,
+    blockStreamingEnabled: false,
+    resolvedBlockStreamingBreak: "message_end",
+    modelState: {
+      resolveDefaultThinkingLevel: async () => "medium",
+    } as never, // same stub cast as above
+    provider: "anthropic",
+    model: "claude-opus-4-1",
+    typing: {
+      onReplyStart: vi.fn().mockResolvedValue(undefined),
+      cleanup: vi.fn(),
+    } as never, // same stub cast as above
+    defaultProvider: "anthropic",
+    defaultModel: "claude-opus-4-1",
+    timeoutMs: 30_000,
+    isNewSession: true,
+    resetTriggered: false,
+    systemSent: true,
+    sessionKey: "session-key",
+    workspaceDir: "/tmp/workspace",
+    abortedLastRun: false,
+    ...overrides, // spread last so caller-supplied fields win; the spread is shallow, so an overridden ctx/sessionCtx replaces the default object entirely
+  };
+}
+
+describe("runPreparedReply media-only handling", () => { // Covers runPreparedReply with an empty text body: once with a media attachment, once without
+  beforeEach(() => {
+    vi.clearAllMocks(); // clear recorded calls so mock.calls[0] and not.toHaveBeenCalled only see the current test
+  });
+
+  it("allows media-only prompts and preserves thread context in queued followups", async () => {
+    const result = await runPreparedReply(baseParams()); // defaults: empty text bodies plus MediaPath
+    expect(result).toEqual({ text: "ok" }); // the value the runReplyAgent mock resolves with
+
+    const call = vi.mocked(runReplyAgent).mock.calls[0]?.[0]; // first argument of the first runReplyAgent call
+    expect(call).toBeTruthy();
+    expect(call?.followupRun.prompt).toContain("[Thread history - for context]");
+    expect(call?.followupRun.prompt).toContain("Earlier message in this thread"); // ThreadHistoryBody from the fixture
+    expect(call?.followupRun.prompt).toContain("[User sent media without caption]"); // placeholder body used when media arrives with no text
+  });
+
+  it("returns the empty-body reply when there is no text and no media", async () => {
+    const result = await runPreparedReply(
+      baseParams({
+        ctx: { // replaces the default ctx wholesale (no ThreadHistoryBody or channel fields)
+          Body: "",
+          RawBody: "",
+          CommandBody: "",
+        },
+        sessionCtx: { // replaces the default sessionCtx; MediaPath and MediaPaths are both absent
+          Body: "",
+          BodyStripped: "",
+          Provider: "slack",
+        },
+      }),
+    );
+
+    expect(result).toEqual({ // the fixed reply runPreparedReply returns on its empty-body early exit
+      text: "I didn't receive any text in your message. Please resend or add a caption.",
+    });
+    expect(vi.mocked(runReplyAgent)).not.toHaveBeenCalled(); // the agent must not run for an empty message
+  });
+});
--- a/src/auto-reply/reply/get-reply-run.ts
+++ b/src/auto-reply/reply/get-reply-run.ts
@@ -221,7 +221,10 @@ export async function runPreparedReply(
    ? baseBodyFinal
    : [inboundUserContext, baseBodyFinal].filter(Boolean).join("\n\n");
  const baseBodyTrimmed = baseBodyForPrompt.trim();
-  if (!baseBodyTrimmed) {
+  const hasMediaAttachment = Boolean(
+    sessionCtx.MediaPath || (sessionCtx.MediaPaths && sessionCtx.MediaPaths.length > 0),
+  );
+  if (!baseBodyTrimmed && !hasMediaAttachment) {
    await typing.onReplyStart();
    logVerbose("Inbound body empty after normalization; skipping agent run");
    typing.cleanup();
@@ -229,8 +232,13 @@ export async function runPreparedReply(
      text: "I didn't receive any text in your message. Please resend or add a caption.",
    };
  }
+  // When the user sends media without text, provide a minimal body so the agent
+  // run proceeds and the image/document is injected by the embedded runner.
+  const effectiveBaseBody = baseBodyTrimmed
+    ? baseBodyForPrompt
+    : "[User sent media without caption]";
  let prefixedBodyBase = await applySessionHints({
-    baseBody: baseBodyForPrompt,
+    baseBody: effectiveBaseBody,
    abortedLastRun,
    sessionEntry,
    sessionStore,
@@ -337,7 +345,7 @@ export async function runPreparedReply(
    sessionEntry,
    resolveSessionFilePathOptions({ agentId, storePath }),
  );
-  const queueBodyBase = [threadContextNote, baseBodyForPrompt].filter(Boolean).join("\n\n");
+  const queueBodyBase = [threadContextNote, effectiveBaseBody].filter(Boolean).join("\n\n");
  const queuedBody = mediaNote
    ? [mediaNote, mediaReplyHint, queueBodyBase].filter(Boolean).join("\n").trim()
    : queueBodyBase;