feat: add PDF analysis tool with native provider support (#31319)

* feat: add PDF analysis tool with native provider support

New `pdf` tool for analyzing PDF documents with model-powered analysis.

Architecture:
- Native PDF path: sends raw PDF bytes directly to providers that support
  inline document input (Anthropic via DocumentBlockParam, Google Gemini
  via inlineData with application/pdf MIME type)
- Extraction fallback: for providers without native PDF support, extracts
  text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas,
  then sends through the standard vision/text completion path

Key features:
- Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10)
- Page range selection (`pages` param, e.g. "1-5", "1,3,7-9")
- Model override (`model` param) and file size limits (`maxBytesMb`)
- Auto-detects provider capability and falls back gracefully
- Same security patterns as image tool (SSRF guards, sandbox support,
  local path roots, workspace-only policy)

Config (agents.defaults):
- pdfModel: primary/fallbacks (defaults to imageModel, then session model)
- pdfMaxBytesMb: max PDF file size (default: 10)
- pdfMaxPages: max pages to process (default: 20)

Model catalog:
- Extended ModelInputType to include "document" alongside "text"/"image"
- Added modelSupportsDocument() capability check

Files:
- src/agents/tools/pdf-tool.ts - main tool factory
- src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.)
- src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google
- src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths
- Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help

* fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
This commit is contained in:
Tyler Yust
2026-03-01 22:39:12 -08:00
committed by GitHub
parent 31b6e58a1b
commit d0ac1b0195
17 changed files with 2008 additions and 100 deletions

View File

@@ -0,0 +1,179 @@
/**
 * Direct HTTP calls (via `fetch`) to providers that support native PDF document input.
 * This bypasses pi-ai's content type system, which does not have a "document" type.
 */
import { isRecord } from "../../utils.js";
import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
/** One PDF document to attach to a native provider request. */
interface PdfInput {
  /** PDF content as a base64 string; forwarded verbatim as the request's document data. */
  base64: string;
  /** Optional file name. The request builders in this file do not read it. */
  filename?: string;
}
// ---------------------------------------------------------------------------
// Anthropic native PDF via Messages API
// ---------------------------------------------------------------------------
/** Base64 PDF payload carried inside a `document` content block. */
interface AnthropicPdfSource {
  type: "base64";
  media_type: "application/pdf";
  data: string;
}

/** Outgoing content block that attaches a single PDF. */
interface AnthropicDocBlock {
  type: "document";
  source: AnthropicPdfSource;
}

/** Outgoing content block carrying plain text (used for the prompt). */
interface AnthropicTextBlock {
  type: "text";
  text: string;
}

/** Any block that may appear in the outgoing user message. */
type AnthropicContentBlock = AnthropicDocBlock | AnthropicTextBlock;

/** Loosely typed view of one entry of the response `content` array. */
interface AnthropicResponseBlock {
  type: string;
  text?: string;
}

/** The response `content` array as read by the PDF analysis call. */
type AnthropicResponseContent = AnthropicResponseBlock[];
/**
 * Analyze one or more PDFs with an Anthropic model.
 *
 * Each PDF is sent as a base64 `document` block, followed by the prompt as a
 * final `text` block, in a single user message POSTed to `{baseUrl}/v1/messages`.
 *
 * @param params.apiKey - API key; passed through `normalizeSecretInput` and sent as `x-api-key`.
 * @param params.modelId - Model identifier placed in the request body.
 * @param params.prompt - Instruction text appended after the document blocks.
 * @param params.pdfs - PDFs to attach; only `base64` is sent.
 * @param params.maxTokens - Value for `max_tokens`; defaults to 4096.
 * @param params.baseUrl - API origin; defaults to `https://api.anthropic.com`. Trailing slashes are stripped.
 * @returns The concatenated text of every `text` block in the response, trimmed.
 * @throws Error if the key is empty after normalization, the HTTP response is not OK,
 *   the body is not a JSON object, `content` is not an array, or no text comes back.
 */
export async function anthropicAnalyzePdf(params: {
  apiKey: string;
  modelId: string;
  prompt: string;
  pdfs: PdfInput[];
  maxTokens?: number;
  baseUrl?: string;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
    throw new Error("Anthropic PDF: apiKey required");
  }

  // Documents first, prompt last, all within one user message.
  const content: AnthropicContentBlock[] = [];
  for (const pdf of params.pdfs) {
    content.push({
      type: "document",
      source: {
        type: "base64",
        media_type: "application/pdf",
        data: pdf.base64,
      },
    });
  }
  content.push({ type: "text", text: params.prompt });

  // Strip trailing slashes so a configured base URL never produces "//v1/messages".
  const baseUrl = (params.baseUrl ?? "https://api.anthropic.com").replace(/\/+$/, "");
  const res = await fetch(`${baseUrl}/v1/messages`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01",
      "anthropic-beta": "pdfs-2024-09-25",
    },
    body: JSON.stringify({
      model: params.modelId,
      max_tokens: params.maxTokens ?? 4096,
      messages: [{ role: "user", content }],
    }),
  });

  if (!res.ok) {
    // The body is best-effort context only; cap it so the error message stays bounded.
    const body = await res.text().catch(() => "");
    throw new Error(
      `Anthropic PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
    );
  }

  // A body that fails to parse becomes null and falls into the "not JSON" branch below.
  const json = (await res.json().catch(() => null)) as unknown;
  if (!isRecord(json)) {
    throw new Error("Anthropic PDF response was not JSON.");
  }

  const responseContent = json.content as AnthropicResponseContent | undefined;
  if (!Array.isArray(responseContent)) {
    throw new Error("Anthropic PDF response missing content array.");
  }

  // The array comes from untrusted JSON: skip entries that are not objects (e.g. null)
  // instead of crashing with a TypeError while reading `.type`.
  const text = responseContent
    .filter(
      (block): block is AnthropicTextBlock =>
        isRecord(block) && block.type === "text" && typeof block.text === "string",
    )
    .map((block) => block.text)
    .join("");
  if (!text.trim()) {
    throw new Error("Anthropic PDF returned no text.");
  }
  return text.trim();
}
// ---------------------------------------------------------------------------
// Google Gemini native PDF via generateContent API
// ---------------------------------------------------------------------------
/** Outgoing part that embeds a file's base64 data together with its MIME type. */
interface GeminiInlineDataPart {
  inline_data: { mime_type: string; data: string };
}

/** Outgoing part carrying plain text (used for the prompt). */
interface GeminiTextPart {
  text: string;
}

/** Any part that may appear in the outgoing `contents[].parts` array. */
type GeminiPart = GeminiInlineDataPart | GeminiTextPart;

/** Loosely typed view of one part of a response candidate. */
interface GeminiResponsePart {
  text?: string;
}

/** Loosely typed view of one entry of the response `candidates` array. */
interface GeminiCandidate {
  content?: { parts?: GeminiResponsePart[] };
}
/**
 * Analyze one or more PDFs with a Google Gemini model.
 *
 * Each PDF is sent as an `inline_data` part with MIME type `application/pdf`,
 * followed by the prompt as a final `text` part, in a single user turn POSTed to
 * `{baseUrl}/v1beta/models/{modelId}:generateContent`.
 *
 * @param params.apiKey - API key; passed through `normalizeSecretInput` and sent as the `key` query parameter.
 * @param params.modelId - Model identifier; URL-encoded into the request path.
 * @param params.prompt - Instruction text appended after the PDF parts.
 * @param params.pdfs - PDFs to attach; only `base64` is sent.
 * @param params.baseUrl - API origin; defaults to `https://generativelanguage.googleapis.com`. Trailing slashes are stripped.
 * @returns The concatenated text parts of the first candidate, trimmed.
 * @throws Error if the key is empty after normalization, the HTTP response is not OK,
 *   the body is not a JSON object, there are no candidates, or no text comes back.
 */
export async function geminiAnalyzePdf(params: {
  apiKey: string;
  modelId: string;
  prompt: string;
  pdfs: PdfInput[];
  baseUrl?: string;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
    throw new Error("Gemini PDF: apiKey required");
  }

  // PDFs first, prompt last, all within one user turn.
  const parts: GeminiPart[] = [];
  for (const pdf of params.pdfs) {
    parts.push({
      inline_data: {
        mime_type: "application/pdf",
        data: pdf.base64,
      },
    });
  }
  parts.push({ text: params.prompt });

  // Strip trailing slashes so a configured base URL never produces "//v1beta/...".
  const baseUrl = (params.baseUrl ?? "https://generativelanguage.googleapis.com").replace(
    /\/+$/,
    "",
  );
  // NOTE(review): the API key is part of the URL here, so anything that records request
  // URLs will record the key. The Gemini REST API documents an `x-goog-api-key` header
  // as an alternative; consider switching once callers and proxies are confirmed to accept it.
  const url = `${baseUrl}/v1beta/models/${encodeURIComponent(params.modelId)}:generateContent?key=${encodeURIComponent(apiKey)}`;
  const res = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      contents: [{ role: "user", parts }],
    }),
  });

  if (!res.ok) {
    // The body is best-effort context only; cap it so the error message stays bounded.
    const body = await res.text().catch(() => "");
    throw new Error(
      `Gemini PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
    );
  }

  // A body that fails to parse becomes null and falls into the "not JSON" branch below.
  const json = (await res.json().catch(() => null)) as unknown;
  if (!isRecord(json)) {
    throw new Error("Gemini PDF response was not JSON.");
  }

  const candidates = json.candidates as GeminiCandidate[] | undefined;
  if (!Array.isArray(candidates) || candidates.length === 0) {
    throw new Error("Gemini PDF returned no candidates.");
  }

  // Only the first candidate is read. The payload is untrusted JSON, so tolerate a
  // null candidate, a missing or non-array `parts`, and non-object parts instead of
  // crashing with a TypeError.
  const rawParts: unknown = candidates[0]?.content?.parts;
  const responseParts: unknown[] = Array.isArray(rawParts) ? rawParts : [];
  const text = responseParts
    .filter(
      (part): part is { text: string } => isRecord(part) && typeof part.text === "string",
    )
    .map((part) => part.text)
    .join("");
  if (!text.trim()) {
    throw new Error("Gemini PDF returned no text.");
  }
  return text.trim();
}