fix(web_fetch): cap response body before parsing

This commit is contained in:
Peter Steinberger
2026-02-16 01:19:04 +01:00
parent fd3d452f1f
commit 166cf6a3e0
7 changed files with 284 additions and 8 deletions

View File

@@ -1,5 +1,8 @@
export type ExtractMode = "markdown" | "text";
const READABILITY_MAX_HTML_CHARS = 1_000_000;
const READABILITY_MAX_ESTIMATED_NESTING_DEPTH = 3_000;
let readabilityDepsPromise:
| Promise<{
Readability: typeof import("@mozilla/readability").Readability;
@@ -107,6 +110,100 @@ export function truncateText(
return { text: value.slice(0, maxChars), truncated: true };
}
function exceedsEstimatedHtmlNestingDepth(html: string, maxDepth: number): boolean {
// Cheap heuristic to skip Readability+DOM parsing on pathological HTML (deep nesting => stack/memory blowups).
// Not an HTML parser; tuned to catch attacker-controlled "<div><div>..." cases.
const voidTags = new Set([
"area",
"base",
"br",
"col",
"embed",
"hr",
"img",
"input",
"link",
"meta",
"param",
"source",
"track",
"wbr",
]);
let depth = 0;
const len = html.length;
for (let i = 0; i < len; i++) {
if (html.charCodeAt(i) !== 60) {
continue; // '<'
}
const next = html.charCodeAt(i + 1);
if (next === 33 || next === 63) {
continue; // <! ...> or <? ...>
}
let j = i + 1;
let closing = false;
if (html.charCodeAt(j) === 47) {
closing = true;
j += 1;
}
while (j < len && html.charCodeAt(j) <= 32) {
j += 1;
}
const nameStart = j;
while (j < len) {
const c = html.charCodeAt(j);
const isNameChar =
(c >= 65 && c <= 90) || // A-Z
(c >= 97 && c <= 122) || // a-z
(c >= 48 && c <= 57) || // 0-9
c === 58 || // :
c === 45; // -
if (!isNameChar) {
break;
}
j += 1;
}
const tagName = html.slice(nameStart, j).toLowerCase();
if (!tagName) {
continue;
}
if (closing) {
depth = Math.max(0, depth - 1);
continue;
}
if (voidTags.has(tagName)) {
continue;
}
// Best-effort self-closing detection: scan a short window for "/>".
let selfClosing = false;
for (let k = j; k < len && k < j + 200; k++) {
const c = html.charCodeAt(k);
if (c === 62) {
if (html.charCodeAt(k - 1) === 47) {
selfClosing = true;
}
break;
}
}
if (selfClosing) {
continue;
}
depth += 1;
if (depth > maxDepth) {
return true;
}
}
return false;
}
export async function extractReadableContent(params: {
html: string;
url: string;
@@ -120,6 +217,12 @@ export async function extractReadableContent(params: {
}
return rendered;
};
if (
params.html.length > READABILITY_MAX_HTML_CHARS ||
exceedsEstimatedHtmlNestingDepth(params.html, READABILITY_MAX_ESTIMATED_NESTING_DEPTH)
) {
return fallback();
}
try {
const { Readability, parseHTML } = await loadReadabilityDeps();
const { document } = parseHTML(params.html);