mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 12:54:58 +00:00
security(web_fetch): strip hidden content to prevent indirect prompt injection (#21074)
* security(web_fetch): strip hidden content to prevent indirect prompt injection Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> * security(web_fetch): address review feedback and credit author * chore(changelog): credit reporter for web_fetch security fix --------- Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com> Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import { sanitizeHtml, stripInvisibleUnicode } from "./web-fetch-visibility.js";
|
||||
|
||||
export type ExtractMode = "markdown" | "text";
|
||||
|
||||
const READABILITY_MAX_HTML_CHARS = 1_000_000;
|
||||
@@ -209,23 +211,26 @@ export async function extractReadableContent(params: {
|
||||
url: string;
|
||||
extractMode: ExtractMode;
|
||||
}): Promise<{ text: string; title?: string } | null> {
|
||||
const cleanHtml = await sanitizeHtml(params.html);
|
||||
const fallback = (): { text: string; title?: string } => {
|
||||
const rendered = htmlToMarkdown(params.html);
|
||||
const rendered = htmlToMarkdown(cleanHtml);
|
||||
if (params.extractMode === "text") {
|
||||
const text = markdownToText(rendered.text) || normalizeWhitespace(stripTags(params.html));
|
||||
const text =
|
||||
stripInvisibleUnicode(markdownToText(rendered.text)) ||
|
||||
stripInvisibleUnicode(normalizeWhitespace(stripTags(cleanHtml)));
|
||||
return { text, title: rendered.title };
|
||||
}
|
||||
return rendered;
|
||||
return { text: stripInvisibleUnicode(rendered.text), title: rendered.title };
|
||||
};
|
||||
if (
|
||||
params.html.length > READABILITY_MAX_HTML_CHARS ||
|
||||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
|
||||
cleanHtml.length > READABILITY_MAX_HTML_CHARS ||
|
||||
exceedsEstimatedHtmlNestingDepth(cleanHtml, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
|
||||
) {
|
||||
return fallback();
|
||||
}
|
||||
try {
|
||||
const { Readability, parseHTML } = await loadReadabilityDeps();
|
||||
const { document } = parseHTML(params.html);
|
||||
const { document } = parseHTML(cleanHtml);
|
||||
try {
|
||||
(document as { baseURI?: string }).baseURI = params.url;
|
||||
} catch {
|
||||
@@ -238,11 +243,11 @@ export async function extractReadableContent(params: {
|
||||
}
|
||||
const title = parsed.title || undefined;
|
||||
if (params.extractMode === "text") {
|
||||
const text = normalizeWhitespace(parsed.textContent ?? "");
|
||||
const text = stripInvisibleUnicode(normalizeWhitespace(parsed.textContent ?? ""));
|
||||
return text ? { text, title } : fallback();
|
||||
}
|
||||
const rendered = htmlToMarkdown(parsed.content);
|
||||
return { text: rendered.text, title: title ?? rendered.title };
|
||||
return { text: stripInvisibleUnicode(rendered.text), title: title ?? rendered.title };
|
||||
} catch {
|
||||
return fallback();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user