Media: preserve PDF MIME classification in file extraction

2026-05-09 01:38:26 +00:00 · 2026-02-21 20:50:17 -08:00
parent 63b4c500d9
commit 6d11b46994
3 changed files with 38 additions and 1 deletion
--- a/src/media-understanding/apply.e2e.test.ts
+++ b/src/media-understanding/apply.e2e.test.ts
@@ -632,6 +632,38 @@ describe("applyMediaUnderstanding", () => {
    expect(ctx.Body).not.toContain("<file");
  });

+  it("does not reclassify PDF attachments as text/plain", async () => {
+    const pseudoPdf = Buffer.from("%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\nendobj\n", "utf8");
+    const filePath = await createTempMediaFile({
+      fileName: "report.pdf",
+      content: pseudoPdf, // ASCII-only bytes that start with a %PDF-1.7 header
+    });
+    // Allow only text/plain for file extraction; the attachment itself is declared application/pdf.
+    const cfg: OpenClawConfig = {
+      ...createMediaDisabledConfig(),
+      gateway: {
+        http: {
+          endpoints: {
+            responses: {
+              files: { allowedMimes: ["text/plain"] },
+            },
+          },
+        },
+      },
+    };
+    // Run the apply step with the PDF MIME type passed explicitly as mediaType.
+    const { ctx, result } = await applyWithDisabledMedia({
+      body: "<media:file>",
+      mediaPath: filePath,
+      mediaType: "application/pdf",
+      cfg,
+    });
+    // Expect no file block: appliedFile is false and Body equals the input body passed above.
+    expect(result.appliedFile).toBe(false);
+    expect(ctx.Body).toBe("<media:file>");
+    expect(ctx.Body).not.toContain("<file"); // implied by the exact match above; states the intent
+  });
+
  it("respects configured allowedMimes for text-like attachments", async () => {
    const tsvText = "a\tb\tc\n1\t2\t3";
    const tsvPath = await createTempMediaFile({
--- a/src/media-understanding/apply.ts
+++ b/src/media-understanding/apply.ts
@@ -382,7 +382,11 @@ async function extractFileBlocks(params: {
    }
    const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
    const textSample = decodeTextSample(bufferResult?.buffer);
-    const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
+    // Do not coerce real PDFs into text/plain via printable-byte heuristics.
+    // PDFs have a dedicated extraction path in extractFileContentFromSource.
+    const allowTextHeuristic = normalizedRawMime !== "application/pdf";
+    const textLike =
+      allowTextHeuristic && (Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer));
    const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined;
    const textHint =
      forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);