Media: preserve PDF MIME classification in file extraction

This commit is contained in:
Vignesh Natarajan
2026-02-21 20:50:17 -08:00
parent 63b4c500d9
commit 6d11b46994
3 changed files with 38 additions and 1 deletions

View File

@@ -632,6 +632,38 @@ describe("applyMediaUnderstanding", () => {
expect(ctx.Body).not.toContain("<file");
});
// Regression guard: an attachment declared as application/pdf whose bytes are
// printable text must not be applied as a file block when the configured
// allowedMimes list contains only text/plain.
it("does not reclassify PDF attachments as text/plain", async () => {
  // Printable payload that starts with a PDF header line.
  const pdfBytes = Buffer.from("%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\nendobj\n", "utf8");
  const pdfPath = await createTempMediaFile({ fileName: "report.pdf", content: pdfBytes });

  // Media-disabled baseline plus a file allow-list restricted to text/plain.
  const textOnlyConfig: OpenClawConfig = {
    ...createMediaDisabledConfig(),
    gateway: {
      http: {
        endpoints: {
          responses: { files: { allowedMimes: ["text/plain"] } },
        },
      },
    },
  };

  const { ctx: finalCtx, result: outcome } = await applyWithDisabledMedia({
    body: "<media:file>",
    mediaPath: pdfPath,
    mediaType: "application/pdf",
    cfg: textOnlyConfig,
  });

  // Nothing was applied and the placeholder body is left untouched.
  expect(outcome.appliedFile).toBe(false);
  expect(finalCtx.Body).toBe("<media:file>");
  expect(finalCtx.Body).not.toContain("<file");
});
it("respects configured allowedMimes for text-like attachments", async () => {
const tsvText = "a\tb\tc\n1\t2\t3";
const tsvPath = await createTempMediaFile({

View File

@@ -382,7 +382,11 @@ async function extractFileBlocks(params: {
}
const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
const textSample = decodeTextSample(bufferResult?.buffer);
const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
// Do not coerce real PDFs into text/plain via printable-byte heuristics.
// PDFs have a dedicated extraction path in extractFileContentFromSource.
const allowTextHeuristic = normalizedRawMime !== "application/pdf";
const textLike =
allowTextHeuristic && (Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer));
const guessedDelimited = textLike ? guessDelimitedMime(textSample) : undefined;
const textHint =
forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);