mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 07:17:40 +00:00
fix(web_fetch): cap response body before parsing
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
export type ExtractMode = "markdown" | "text";
|
||||
|
||||
const READABILITY_MAX_HTML_CHARS = 1_000_000;
|
||||
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;
|
||||
|
||||
let readabilityDepsPromise:
|
||||
| Promise<{
|
||||
Readability: typeof import("@mozilla/readability").Readability;
|
||||
@@ -107,6 +110,100 @@ export function truncateText(
|
||||
return { text: value.slice(0, maxChars), truncated: true };
|
||||
}
|
||||
|
||||
function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
|
||||
// Cheap heuristic to skip Readability+DOM parsing on pathological HTML (deep nesting => stack/memory blowups).
|
||||
// Not an HTML parser; tuned to catch attacker-controlled "<div><div>..." cases.
|
||||
const voidTags = new Set([
|
||||
"area",
|
||||
"base",
|
||||
"br",
|
||||
"col",
|
||||
"embed",
|
||||
"hr",
|
||||
"img",
|
||||
"input",
|
||||
"link",
|
||||
"meta",
|
||||
"param",
|
||||
"source",
|
||||
"track",
|
||||
"wbr",
|
||||
]);
|
||||
|
||||
let depth = 0;
|
||||
const len = html.length;
|
||||
for (let i = 0; i < len; i++) {
|
||||
if (html.charCodeAt(i) !== 60) {
|
||||
continue; // '<'
|
||||
}
|
||||
const next = html.charCodeAt(i + 1);
|
||||
if (next === 33 || next === 63) {
|
||||
continue; // <! ...> or <? ...>
|
||||
}
|
||||
|
||||
let j = i + 1;
|
||||
let closing = false;
|
||||
if (html.charCodeAt(j) === 47) {
|
||||
closing = true;
|
||||
j += 1;
|
||||
}
|
||||
|
||||
while (j < len && html.charCodeAt(j) <= 32) {
|
||||
j += 1;
|
||||
}
|
||||
|
||||
const nameStart = j;
|
||||
while (j < len) {
|
||||
const c = html.charCodeAt(j);
|
||||
const isNameChar =
|
||||
(c >= 65 && c <= 90) || // A-Z
|
||||
(c >= 97 && c <= 122) || // a-z
|
||||
(c >= 48 && c <= 57) || // 0-9
|
||||
c === 58 || // :
|
||||
c === 45; // -
|
||||
if (!isNameChar) {
|
||||
break;
|
||||
}
|
||||
j += 1;
|
||||
}
|
||||
|
||||
const tagName = html.slice(nameStart, j).toLowerCase();
|
||||
if (!tagName) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (closing) {
|
||||
depth = Math.max(0, depth - 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (voidTags.has(tagName)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Best-effort self-closing detection: scan a short window for "/>".
|
||||
let selfClosing = false;
|
||||
for (let k = j; k < len && k < j + 200; k++) {
|
||||
const c = html.charCodeAt(k);
|
||||
if (c === 62) {
|
||||
if (html.charCodeAt(k - 1) === 47) {
|
||||
selfClosing = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (selfClosing) {
|
||||
continue;
|
||||
}
|
||||
|
||||
depth += 1;
|
||||
if (depth > maxDepth) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
export async function extractReadableContent(params: {
|
||||
html: string;
|
||||
url: string;
|
||||
@@ -120,6 +217,12 @@ export async function extractReadableContent(params: {
|
||||
}
|
||||
return rendered;
|
||||
};
|
||||
if (
|
||||
params.html.length > READABILITY_MAX_HTML_CHARS ||
|
||||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
|
||||
) {
|
||||
return fallback();
|
||||
}
|
||||
try {
|
||||
const { Readability, parseHTML } = await loadReadabilityDeps();
|
||||
const { document } = parseHTML(params.html);
|
||||
|
||||
Reference in New Issue
Block a user