security(web_fetch): strip hidden content to prevent indirect prompt injection (#21074)

* security(web_fetch): strip hidden content to prevent indirect prompt injection

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

* security(web_fetch): address review feedback and credit author

* chore(changelog): credit reporter for web_fetch security fix

---------

Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
Robin Waslander
2026-02-23 00:10:26 +01:00
committed by GitHub
parent 73e5bb7635
commit 44727dc3a1
4 changed files with 416 additions and 8 deletions

View File

@@ -1,3 +1,5 @@
import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js";
export type ExtractMode = "markdown" | "text";
const READABILITY_MAX_HTML_CHARS = 1_000_000;
@@ -209,23 +211,26 @@ export async function extractReadableContent(params: {
url: string;
extractMode: ExtractMode;
}): Promise<{ text: string; title?: string } | null> {
const cleanHtml = await sanitizeHtml(params.html);
const fallback = (): { text: string; title?: string } => {
const rendered = htmlToMarkdown(params.html);
const rendered = htmlToMarkdown(cleanHtml);
if (params.extractMode === "text") {
const text = markdownToText(rendered.text) || normalizeWhitespace(stripTags(params.html));
const text =
stripInvisibleUnicode(markdownToText(rendered.text)) ||
stripInvisibleUnicode(normalizeWhitespace(stripTags(cleanHtml)));
return { text, title: rendered.title };
}
return rendered;
return { text: stripInvisibleUnicode(rendered.text), title: rendered.title };
};
if (
params.html.length > READABILITY_MAX_HTML_CHARS ||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
cleanHtml.length > READABILITY_MAX_HTML_CHARS ||
exceedsEstimatedHtmlNestingDepth(cleanHtml, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
) {
return fallback();
}
try {
const { Readability, parseHTML } = await loadReadabilityDeps();
const { document } = parseHTML(params.html);
const { document } = parseHTML(cleanHtml);
try {
(document as { baseURI?: string }).baseURI = params.url;
} catch {
@@ -238,11 +243,11 @@ export async function extractReadableContent(params: {
}
const title = parsed.title || undefined;
if (params.extractMode === "text") {
const text = normalizeWhitespace(parsed.textContent ?? "");
const text = stripInvisibleUnicode(normalizeWhitespace(parsed.textContent ?? ""));
return text ? { text, title } : fallback();
}
const rendered = htmlToMarkdown(parsed.content);
return { text: rendered.text, title: title ?? rendered.title };
return { text: stripInvisibleUnicode(rendered.text), title: title ?? rendered.title };
} catch {
return fallback();
}