fix: harden web fetch SSRF and redirects

Co-authored-by: Eli <fogboots@users.noreply.github.com>
This commit is contained in:
Peter Steinberger
2026-01-21 02:52:27 +00:00
parent ec51bb700c
commit 5bd55037e4
11 changed files with 412 additions and 82 deletions

View File

@@ -1,6 +1,7 @@
import { Type } from "@sinclair/typebox";
import type { ClawdbotConfig } from "../../config/config.js";
import { assertPublicHostname, SsrFBlockedError } from "../../infra/net/ssrf.js";
import { stringEnum } from "../schema/typebox.js";
import type { AnyAgentTool } from "./common.js";
import { jsonResult, readNumberParam, readStringParam } from "./common.js";
@@ -29,6 +30,7 @@ export { extractReadableContent } from "./web-fetch-utils.js";
const EXTRACT_MODES = ["markdown", "text"] as const;
const DEFAULT_FETCH_MAX_CHARS = 50_000;
const DEFAULT_FETCH_MAX_REDIRECTS = 3;
const DEFAULT_ERROR_MAX_CHARS = 4_000;
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
@@ -144,6 +146,11 @@ function resolveMaxChars(value: unknown, fallback: number): number {
return Math.max(100, Math.floor(parsed));
}
function resolveMaxRedirects(value: unknown, fallback: number): number {
const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
return Math.max(0, Math.floor(parsed));
}
function looksLikeHtml(value: string): boolean {
const trimmed = value.trimStart();
if (!trimmed) return false;
@@ -151,6 +158,68 @@ function looksLikeHtml(value: string): boolean {
return head.startsWith("<!doctype html") || head.startsWith("<html");
}
function isRedirectStatus(status: number): boolean {
return status === 301 || status === 302 || status === 303 || status === 307 || status === 308;
}
async function fetchWithRedirects(params: {
url: string;
maxRedirects: number;
timeoutSeconds: number;
userAgent: string;
}): Promise<{ response: Response; finalUrl: string }> {
const signal = withTimeout(undefined, params.timeoutSeconds * 1000);
const visited = new Set<string>();
let currentUrl = params.url;
let redirectCount = 0;
while (true) {
let parsedUrl: URL;
try {
parsedUrl = new URL(currentUrl);
} catch {
throw new Error("Invalid URL: must be http or https");
}
if (!["http:", "https:"].includes(parsedUrl.protocol)) {
throw new Error("Invalid URL: must be http or https");
}
await assertPublicHostname(parsedUrl.hostname);
const res = await fetch(parsedUrl.toString(), {
method: "GET",
headers: {
Accept: "*/*",
"User-Agent": params.userAgent,
"Accept-Language": "en-US,en;q=0.9",
},
signal,
redirect: "manual",
});
if (isRedirectStatus(res.status)) {
const location = res.headers.get("location");
if (!location) {
throw new Error(`Redirect missing location header (${res.status})`);
}
redirectCount += 1;
if (redirectCount > params.maxRedirects) {
throw new Error(`Too many redirects (limit: ${params.maxRedirects})`);
}
const nextUrl = new URL(location, parsedUrl).toString();
if (visited.has(nextUrl)) {
throw new Error("Redirect loop detected");
}
visited.add(nextUrl);
void res.body?.cancel();
currentUrl = nextUrl;
continue;
}
return { response: res, finalUrl: currentUrl };
}
}
function formatWebFetchErrorDetail(params: {
detail: string;
contentType?: string | null;
@@ -247,6 +316,7 @@ async function runWebFetch(params: {
url: string;
extractMode: ExtractMode;
maxChars: number;
maxRedirects: number;
timeoutSeconds: number;
cacheTtlMs: number;
userAgent: string;
@@ -278,20 +348,23 @@ async function runWebFetch(params: {
const start = Date.now();
let res: Response;
let finalUrl = params.url;
try {
res = await fetch(parsedUrl.toString(), {
method: "GET",
headers: {
Accept: "*/*",
"User-Agent": params.userAgent,
"Accept-Language": "en-US,en;q=0.9",
},
signal: withTimeout(undefined, params.timeoutSeconds * 1000),
const result = await fetchWithRedirects({
url: params.url,
maxRedirects: params.maxRedirects,
timeoutSeconds: params.timeoutSeconds,
userAgent: params.userAgent,
});
res = result.response;
finalUrl = result.finalUrl;
} catch (error) {
if (error instanceof SsrFBlockedError) {
throw error;
}
if (params.firecrawlEnabled && params.firecrawlApiKey) {
const firecrawl = await fetchFirecrawlContent({
url: params.url,
url: finalUrl,
extractMode: params.extractMode,
apiKey: params.firecrawlApiKey,
baseUrl: params.firecrawlBaseUrl,
@@ -304,7 +377,7 @@ async function runWebFetch(params: {
const truncated = truncateText(firecrawl.text, params.maxChars);
const payload = {
url: params.url,
finalUrl: firecrawl.finalUrl || params.url,
finalUrl: firecrawl.finalUrl || finalUrl,
status: firecrawl.status ?? 200,
contentType: "text/markdown",
title: firecrawl.title,
@@ -339,7 +412,7 @@ async function runWebFetch(params: {
const truncated = truncateText(firecrawl.text, params.maxChars);
const payload = {
url: params.url,
finalUrl: firecrawl.finalUrl || params.url,
finalUrl: firecrawl.finalUrl || finalUrl,
status: firecrawl.status ?? res.status,
contentType: "text/markdown",
title: firecrawl.title,
@@ -374,7 +447,7 @@ async function runWebFetch(params: {
if (params.readabilityEnabled) {
const readable = await extractReadableContent({
html: body,
url: res.url || params.url,
url: finalUrl,
extractMode: params.extractMode,
});
if (readable?.text) {
@@ -382,7 +455,7 @@ async function runWebFetch(params: {
title = readable.title;
extractor = "readability";
} else {
const firecrawl = await tryFirecrawlFallback(params);
const firecrawl = await tryFirecrawlFallback({ ...params, url: finalUrl });
if (firecrawl) {
text = firecrawl.text;
title = firecrawl.title;
@@ -411,7 +484,7 @@ async function runWebFetch(params: {
const truncated = truncateText(text, params.maxChars);
const payload = {
url: params.url,
finalUrl: res.url || params.url,
finalUrl,
status: res.status,
contentType,
title,
@@ -508,6 +581,7 @@ export function createWebFetchTool(options?: {
url,
extractMode,
maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS),
maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS),
timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS),
cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES),
userAgent,