mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-12 15:21:10 +00:00
feat: add PDF analysis tool with native provider support (#31319)
* feat: add PDF analysis tool with native provider support

  New `pdf` tool for analyzing PDF documents with model-powered analysis.

  Architecture:
  - Native PDF path: sends raw PDF bytes directly to providers that support inline document input (Anthropic via DocumentBlockParam, Google Gemini via inlineData with application/pdf MIME type)
  - Extraction fallback: for providers without native PDF support, extracts text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas, then sends through the standard vision/text completion path

  Key features:
  - Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10)
  - Page range selection (`pages` param, e.g. "1-5", "1,3,7-9")
  - Model override (`model` param) and file size limits (`maxBytesMb`)
  - Auto-detects provider capability and falls back gracefully
  - Same security patterns as image tool (SSRF guards, sandbox support, local path roots, workspace-only policy)

  Config (agents.defaults):
  - pdfModel: primary/fallbacks (defaults to imageModel, then session model)
  - pdfMaxBytesMb: max PDF file size (default: 10)
  - pdfMaxPages: max pages to process (default: 20)

  Model catalog:
  - Extended ModelInputType to include "document" alongside "text"/"image"
  - Added modelSupportsDocument() capability check

  Files:
  - src/agents/tools/pdf-tool.ts - main tool factory
  - src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.)
  - src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google
  - src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths
  - Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help

* fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
This commit is contained in:
103
src/agents/tools/pdf-tool.helpers.ts
Normal file
103
src/agents/tools/pdf-tool.helpers.ts
Normal file
@@ -0,0 +1,103 @@
|
||||
import type { AssistantMessage } from "@mariozechner/pi-ai";
|
||||
import type { OpenClawConfig } from "../../config/config.js";
|
||||
import {
|
||||
resolveAgentModelFallbackValues,
|
||||
resolveAgentModelPrimaryValue,
|
||||
} from "../../config/model-input.js";
|
||||
import { extractAssistantText } from "../pi-embedded-utils.js";
|
||||
|
||||
/**
 * Model selection for the PDF tool, as produced by `coercePdfModelConfig`.
 *
 * Both fields are optional: `primary` is a single model reference string and
 * `fallbacks` is a list of model reference strings.
 * NOTE(review): presumably fallbacks are tried in array order after `primary` — confirm against the caller.
 */
export type PdfModelConfig = { primary?: string; fallbacks?: string[] };
|
||||
|
||||
/**
 * Providers known to support native PDF document input.
 * When the model's provider is in this set, the tool sends raw PDF bytes
 * via provider-specific API calls instead of extracting text/images first.
 *
 * Entries are stored lowercase; use `providerSupportsNativePdf` for a lookup
 * that normalizes case and surrounding whitespace.
 */
export const NATIVE_PDF_PROVIDERS = new Set(["anthropic", "google"]);

/**
 * Check whether a provider supports native PDF document input.
 *
 * @param provider - Provider id; it is lowercased and trimmed before the lookup.
 * @returns `true` when the normalized id is a member of `NATIVE_PDF_PROVIDERS`.
 */
export function providerSupportsNativePdf(provider: string): boolean {
  return NATIVE_PDF_PROVIDERS.has(provider.toLowerCase().trim());
}
|
||||
|
||||
/**
 * Parse a page range string (e.g. "1-5", "3", "1-3,7-9") into an array of 1-based page numbers.
 *
 * The input is a comma-separated list whose entries are either a single page
 * ("3") or an inclusive ascending range ("1-5"). Whitespace around entries and
 * around the dash is ignored, and empty entries are skipped.
 *
 * Pages greater than `maxPages` are dropped silently rather than rejected, so
 * the result can be empty even for syntactically valid input.
 *
 * @param range - Page selection string.
 * @param maxPages - Highest page number to keep in the result.
 * @returns Unique page numbers in ascending order, each within `1..maxPages`.
 * @throws Error with `Invalid page range: "<entry>"` when a range starts below 1
 *   or ends before it starts, and `Invalid page number: "<entry>"` when a single
 *   entry is not a positive decimal integer.
 */
export function parsePageRange(range: string, maxPages: number): number[] {
  // Hoisted so the patterns are built once per call instead of once per entry.
  const rangePattern = /^(\d+)\s*-\s*(\d+)$/;
  const singlePattern = /^\d+$/;

  const pages = new Set<number>();
  for (const rawPart of range.split(",")) {
    const part = rawPart.trim();
    if (!part) {
      continue;
    }

    const dashMatch = rangePattern.exec(part);
    if (dashMatch) {
      const start = Number(dashMatch[1]);
      const end = Number(dashMatch[2]);
      // The finiteness check guards against digit strings long enough to overflow to Infinity.
      if (!Number.isFinite(start) || !Number.isFinite(end) || start < 1 || end < start) {
        throw new Error(`Invalid page range: "${part}"`);
      }
      // Clamp the upper bound so an oversized range cannot iterate past the document.
      const last = Math.min(end, maxPages);
      for (let page = start; page <= last; page++) {
        pages.add(page);
      }
      continue;
    }

    // Require plain decimal digits: Number() alone would also accept "1.5",
    // "1e1" or "0x5" and could put fractional page numbers into the result.
    const num = singlePattern.test(part) ? Number(part) : Number.NaN;
    if (!Number.isFinite(num) || num < 1) {
      throw new Error(`Invalid page number: "${part}"`);
    }
    if (num <= maxPages) {
      pages.add(num);
    }
  }

  // The spread creates a fresh array, so sorting in place does not mutate shared state.
  return [...pages].sort((a, b) => a - b);
}
|
||||
|
||||
/**
 * Reduce a PDF-analysis model reply to its text, or fail with an error that
 * names the provider and model.
 *
 * @param params - `message` is the assistant reply to inspect; `provider` and
 *   `model` are only used to label error messages.
 * @returns The result of `extractAssistantText(message)`, trimmed.
 * @throws Error `PDF model failed (...)` when `stopReason` is "error" or
 *   "aborted", or when the message carries a non-blank `errorMessage` (which is
 *   then appended to the error text).
 * @throws Error `PDF model returned no text (...)` when the extracted text is
 *   empty or whitespace-only.
 */
export function coercePdfAssistantText(params: {
  message: AssistantMessage;
  provider: string;
  model: string;
}): string {
  const { message, provider, model } = params;
  const stop = message.stopReason;
  const errorMessage = message.errorMessage?.trim();

  // A failed/aborted stop reason and a stray error message are reported the
  // same way; the detail suffix is only added when there is one to show.
  if (stop === "error" || stop === "aborted" || errorMessage) {
    throw new Error(
      errorMessage
        ? `PDF model failed (${provider}/${model}): ${errorMessage}`
        : `PDF model failed (${provider}/${model})`,
    );
  }

  const text = extractAssistantText(message).trim();
  if (!text) {
    throw new Error(`PDF model returned no text (${provider}/${model}).`);
  }
  return text;
}
|
||||
|
||||
/**
 * Build the PDF tool's model selection from `agents.defaults.pdfModel`.
 *
 * The primary and fallback values are obtained from
 * `resolveAgentModelPrimaryValue` and `resolveAgentModelFallbackValues`.
 * A key is only present in the result when it has content: `primary` when the
 * resolved value is non-blank (stored trimmed), `fallbacks` when the resolved
 * list is non-empty.
 *
 * @param cfg - Application config; may be omitted, in which case the resolvers
 *   receive `undefined`.
 * @returns A `PdfModelConfig` containing only the populated fields.
 */
export function coercePdfModelConfig(cfg?: OpenClawConfig): PdfModelConfig {
  const pdfModel = cfg?.agents?.defaults?.pdfModel;
  const primary = resolveAgentModelPrimaryValue(pdfModel)?.trim();
  const fallbacks = resolveAgentModelFallbackValues(pdfModel);

  const result: PdfModelConfig = {};
  if (primary) {
    result.primary = primary;
  }
  if (fallbacks.length > 0) {
    result.fallbacks = fallbacks;
  }
  return result;
}
|
||||
|
||||
/**
 * Pick the max-token value to use for a PDF tool request.
 *
 * @param modelMaxTokens - Token limit reported for the model, if any.
 * @param requestedMaxTokens - Desired token budget; defaults to 4096.
 * @returns The smaller of `requestedMaxTokens` and `modelMaxTokens` when the
 *   model limit is a finite positive number; otherwise `requestedMaxTokens`
 *   unchanged (a missing, non-finite, zero or negative model limit is ignored).
 */
export function resolvePdfToolMaxTokens(
  modelMaxTokens: number | undefined,
  requestedMaxTokens = 4096,
): number {
  if (
    typeof modelMaxTokens === "number" &&
    Number.isFinite(modelMaxTokens) &&
    modelMaxTokens > 0
  ) {
    return Math.min(requestedMaxTokens, modelMaxTokens);
  }
  // No usable model limit: keep the requested budget as-is.
  return requestedMaxTokens;
}
|
||||
Reference in New Issue
Block a user