mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 03:18:26 +00:00
Web: trim HTML error bodies in web_fetch (#1193)
* Web: trim HTML error bodies in web_fetch
* fix: trim web_fetch HTML error bodies (#1193) (thanks @sebslight)

---------

Co-authored-by: Sebastian Slight <sbarrios93@gmail.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
This commit is contained in:
@@ -18,6 +18,7 @@ import {
|
||||
} from "./web-shared.js";
|
||||
import {
|
||||
extractReadableContent,
|
||||
htmlToMarkdown,
|
||||
markdownToText,
|
||||
truncateText,
|
||||
type ExtractMode,
|
||||
@@ -28,6 +29,7 @@ export { extractReadableContent } from "./web-fetch-utils.js";
|
||||
// Extraction mode literals. Presumably the runtime counterpart of the imported
// `ExtractMode` type; confirm against web-fetch-utils.
const EXTRACT_MODES = ["markdown", "text"] as const;
|
||||
|
||||
// Default character budget for fetched content (inferred from the name; the
// call sites are not visible in this excerpt; TODO confirm).
const DEFAULT_FETCH_MAX_CHARS = 50_000;
|
||||
// Character cap passed as `maxChars` to formatWebFetchErrorDetail when a fetch
// fails, so error messages carry at most this much of the response body.
const DEFAULT_ERROR_MAX_CHARS = 4_000;
|
||||
// Firecrawl API origin used when none is configured (assumed from the name;
// verify against the code that reads it).
const DEFAULT_FIRECRAWL_BASE_URL = "https://api.firecrawl.dev";
|
||||
// 172_800_000 ms = 48 hours.
const DEFAULT_FIRECRAWL_MAX_AGE_MS = 172_800_000;
|
||||
const DEFAULT_FETCH_USER_AGENT =
|
||||
@@ -142,6 +144,30 @@ function resolveMaxChars(value: unknown, fallback: number): number {
|
||||
return Math.max(100, Math.floor(parsed));
|
||||
}
|
||||
|
||||
/**
 * Heuristically decides whether a response body is an HTML document.
 *
 * Only the beginning of the body is inspected. Leading whitespace (including a
 * byte-order mark, which `trimStart` removes) is ignored, as are an optional
 * `<?xml ...?>` prolog and any leading `<!-- ... -->` comments, since XHTML and
 * templated error pages commonly emit those before the doctype or root tag.
 *
 * @param value - Raw response body.
 * @returns `true` when the body starts with an HTML doctype or an `<html>` tag.
 */
function looksLikeHtml(value: string): boolean {
  // Bound the amount of text the regex can ever look at, so huge bodies stay cheap.
  // A leading comment longer than this window simply results in `false`.
  const sniffWindow = 512;
  const head = value.trimStart().slice(0, sniffWindow).toLowerCase();
  if (!head) return false;

  // - optional XML prolog / comments, each followed by optional whitespace
  // - then `<!doctype html` (any whitespace between the two words) or `<html`
  // - then a delimiter (or end of input), so tags like `<htmlish>` do not match
  const htmlStart =
    /^(?:(?:<\?xml[^>]*\?>|<!--[\s\S]*?-->)\s*)*<(?:!doctype\s+html|html)(?=[\s>\/]|$)/;
  return htmlStart.test(head);
}
|
||||
|
||||
/**
 * Builds the short, human-readable detail string used in web_fetch error messages.
 *
 * When the body is HTML (declared via the content type or detected by
 * `looksLikeHtml`), it is converted with `htmlToMarkdown`, the page title (if
 * any) is placed on the first line, and the result is flattened with
 * `markdownToText`. Any other body is used as-is. The text is then trimmed and
 * cut down with `truncateText`.
 *
 * @param params.detail - Raw response body text.
 * @param params.contentType - Value of the response `content-type` header, if any.
 * @param params.maxChars - Character limit handed to `truncateText`.
 * @returns The formatted detail, or `""` when `detail` is empty.
 */
function formatWebFetchErrorDetail(params: {
  detail: string;
  contentType?: string | null;
  maxChars: number;
}): string {
  const { detail, contentType, maxChars } = params;
  if (!detail) return "";

  // Header values may carry parameters (e.g. "; charset=utf-8") and mixed case,
  // so match on a lowercased substring. XHTML is markup too and gets the same
  // treatment as text/html.
  const mediaType = contentType?.toLowerCase() ?? "";
  const declaredHtml =
    mediaType.includes("text/html") || mediaType.includes("application/xhtml+xml");

  let text = detail;
  if (declaredHtml || looksLikeHtml(detail)) {
    const rendered = htmlToMarkdown(detail);
    const withTitle = rendered.title ? `${rendered.title}\n${rendered.text}` : rendered.text;
    text = markdownToText(withTitle);
  }

  return truncateText(text.trim(), maxChars).text;
}
|
||||
export async function fetchFirecrawlContent(params: {
|
||||
url: string;
|
||||
extractMode: ExtractMode;
|
||||
@@ -329,7 +355,12 @@ async function runWebFetch(params: {
|
||||
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
||||
return payload;
|
||||
}
|
||||
const detail = await readResponseText(res);
|
||||
const rawDetail = await readResponseText(res);
|
||||
const detail = formatWebFetchErrorDetail({
|
||||
detail: rawDetail,
|
||||
contentType: res.headers.get("content-type"),
|
||||
maxChars: DEFAULT_ERROR_MAX_CHARS,
|
||||
});
|
||||
throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user