mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-12 11:51:12 +00:00
feat: add PDF analysis tool with native provider support (#31319)
* feat: add PDF analysis tool with native provider support New `pdf` tool for analyzing PDF documents with model-powered analysis. Architecture: - Native PDF path: sends raw PDF bytes directly to providers that support inline document input (Anthropic via DocumentBlockParam, Google Gemini via inlineData with application/pdf MIME type) - Extraction fallback: for providers without native PDF support, extracts text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas, then sends through the standard vision/text completion path Key features: - Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10) - Page range selection (`pages` param, e.g. "1-5", "1,3,7-9") - Model override (`model` param) and file size limits (`maxBytesMb`) - Auto-detects provider capability and falls back gracefully - Same security patterns as image tool (SSRF guards, sandbox support, local path roots, workspace-only policy) Config (agents.defaults): - pdfModel: primary/fallbacks (defaults to imageModel, then session model) - pdfMaxBytesMb: max PDF file size (default: 10) - pdfMaxPages: max pages to process (default: 20) Model catalog: - Extended ModelInputType to include "document" alongside "text"/"image" - Added modelSupportsDocument() capability check Files: - src/agents/tools/pdf-tool.ts - main tool factory - src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.) - src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google - src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths - Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help * fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
This commit is contained in:
179
src/agents/tools/pdf-native-providers.ts
Normal file
179
src/agents/tools/pdf-native-providers.ts
Normal file
@@ -0,0 +1,179 @@
|
||||
/**
|
||||
* Direct SDK/HTTP calls for providers that support native PDF document input.
|
||||
* This bypasses pi-ai's content type system which does not have a "document" type.
|
||||
*/
|
||||
|
||||
import { isRecord } from "../../utils.js";
|
||||
import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
|
||||
|
||||
/** One PDF document handed to the native provider calls in this file. */
type PdfInput = {
  /** PDF content as a base64 string; forwarded unchanged as the document payload. */
  base64: string;
  /** Optional file name. Not read by the provider calls visible in this file. */
  filename?: string;
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Anthropic – native PDF via Messages API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Request content block that carries one PDF inline as base64. */
type AnthropicDocBlock = {
  type: "document";
  source: {
    type: "base64";
    media_type: "application/pdf";
    /** Base64 PDF content. */
    data: string;
  };
};

/** Request content block that carries plain text (used here for the prompt). */
type AnthropicTextBlock = {
  type: "text";
  text: string;
};

/** Any block that may appear in the user message built by this file. */
type AnthropicContentBlock = AnthropicDocBlock | AnthropicTextBlock;

/**
 * Loose view of the response `content` array: only `type` and an optional
 * `text` are modelled because those are the only fields read here.
 */
type AnthropicResponseContent = { type: string; text?: string }[];
|
||||
|
||||
/**
 * Sends one or more PDFs plus a text prompt to the Anthropic Messages API as
 * inline base64 `document` blocks and returns the text of the reply.
 *
 * @param params.apiKey - API key; passed through `normalizeSecretInput`, and an
 *   empty result is rejected.
 * @param params.modelId - Value placed in the request's `model` field.
 * @param params.prompt - Instruction text, appended after all document blocks.
 * @param params.pdfs - PDFs to attach, in the order they should be sent.
 * @param params.maxTokens - Value for `max_tokens`; defaults to 4096.
 * @param params.baseUrl - API origin override (trailing slashes are stripped);
 *   defaults to `https://api.anthropic.com`.
 * @returns The concatenation of every `text` block in the response, trimmed.
 * @throws Error when the key is empty, the HTTP status is not OK, the body is
 *   not a JSON object, `content` is not an array, or no text came back.
 */
export async function anthropicAnalyzePdf(params: {
  apiKey: string;
  modelId: string;
  prompt: string;
  pdfs: PdfInput[];
  maxTokens?: number;
  baseUrl?: string;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
    throw new Error("Anthropic PDF: apiKey required");
  }

  // Documents first, prompt last.
  const content: AnthropicContentBlock[] = [
    ...params.pdfs.map(
      (pdf): AnthropicContentBlock => ({
        type: "document",
        source: {
          type: "base64",
          media_type: "application/pdf",
          data: pdf.base64,
        },
      }),
    ),
    { type: "text", text: params.prompt },
  ];

  const baseUrl = (params.baseUrl ?? "https://api.anthropic.com").replace(/\/+$/, "");
  const res = await fetch(`${baseUrl}/v1/messages`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-api-key": apiKey,
      "anthropic-version": "2023-06-01",
      "anthropic-beta": "pdfs-2024-09-25",
    },
    body: JSON.stringify({
      model: params.modelId,
      max_tokens: params.maxTokens ?? 4096,
      messages: [{ role: "user", content }],
    }),
  });

  if (!res.ok) {
    // Best effort: include a bounded slice of the error body when it is readable.
    const body = await res.text().catch(() => "");
    throw new Error(
      `Anthropic PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
    );
  }

  const json: unknown = await res.json().catch(() => null);
  if (!isRecord(json)) {
    throw new Error("Anthropic PDF response was not JSON.");
  }

  const responseContent: unknown = json.content;
  if (!Array.isArray(responseContent)) {
    throw new Error("Anthropic PDF response missing content array.");
  }

  // Narrow each element at runtime instead of trusting a cast: entries that are
  // not objects (e.g. null) are skipped rather than crashing on `block.type`.
  type TextItem = AnthropicResponseContent[number] & { text: string };
  const isTextItem = (block: unknown): block is TextItem =>
    isRecord(block) && block.type === "text" && typeof block.text === "string";

  const text = responseContent
    .filter(isTextItem)
    .map((block) => block.text)
    .join("")
    .trim();

  if (!text) {
    throw new Error("Anthropic PDF returned no text.");
  }

  return text;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Google Gemini – native PDF via generateContent API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Request part: either inline binary data (base64 plus MIME type) or plain text. */
type GeminiPart =
  | { inline_data: { mime_type: string; data: string } }
  | { text: string };

/**
 * Loose view of one response candidate: only the optional `text` of each part
 * is modelled because that is the only field read here.
 */
type GeminiCandidate = {
  content?: {
    parts?: { text?: string }[];
  };
};
|
||||
|
||||
/**
 * Sends one or more PDFs plus a text prompt to the Gemini `generateContent`
 * endpoint as inline `application/pdf` parts and returns the text of the reply.
 *
 * @param params.apiKey - API key; passed through `normalizeSecretInput`, and an
 *   empty result is rejected.
 * @param params.modelId - Model identifier; URL-encoded into the request path.
 * @param params.prompt - Instruction text, appended after all PDF parts.
 * @param params.pdfs - PDFs to attach, in the order they should be sent.
 * @param params.baseUrl - API origin override (trailing slashes are stripped);
 *   defaults to `https://generativelanguage.googleapis.com`.
 * @returns The concatenated text parts of the first candidate, trimmed.
 * @throws Error when the key is empty, the HTTP status is not OK, the body is
 *   not a JSON object, there are no candidates, or no text came back.
 */
export async function geminiAnalyzePdf(params: {
  apiKey: string;
  modelId: string;
  prompt: string;
  pdfs: PdfInput[];
  baseUrl?: string;
}): Promise<string> {
  const apiKey = normalizeSecretInput(params.apiKey);
  if (!apiKey) {
    throw new Error("Gemini PDF: apiKey required");
  }

  // PDFs first, prompt last.
  const parts: GeminiPart[] = [
    ...params.pdfs.map(
      (pdf): GeminiPart => ({
        inline_data: {
          mime_type: "application/pdf",
          data: pdf.base64,
        },
      }),
    ),
    { text: params.prompt },
  ];

  const baseUrl = (params.baseUrl ?? "https://generativelanguage.googleapis.com").replace(
    /\/+$/,
    "",
  );
  const url = `${baseUrl}/v1beta/models/${encodeURIComponent(params.modelId)}:generateContent`;

  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // Send the key as a header rather than a `?key=` query parameter so the
      // secret does not end up in request URLs (and whatever logs them).
      "x-goog-api-key": apiKey,
    },
    body: JSON.stringify({
      contents: [{ role: "user", parts }],
    }),
  });

  if (!res.ok) {
    // Best effort: include a bounded slice of the error body when it is readable.
    const body = await res.text().catch(() => "");
    throw new Error(
      `Gemini PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
    );
  }

  const json: unknown = await res.json().catch(() => null);
  if (!isRecord(json)) {
    throw new Error("Gemini PDF response was not JSON.");
  }

  const candidates: unknown = json.candidates;
  if (!Array.isArray(candidates) || candidates.length === 0) {
    throw new Error("Gemini PDF returned no candidates.");
  }

  // Only the first candidate is read. Its shape is narrowed at runtime so a
  // malformed candidate yields the "no text" error instead of a TypeError.
  type ResponsePart = NonNullable<NonNullable<GeminiCandidate["content"]>["parts"]>[number];
  const isTextPart = (part: unknown): part is ResponsePart & { text: string } =>
    isRecord(part) && typeof part.text === "string";

  const first: unknown = candidates[0];
  const rawParts: unknown =
    isRecord(first) && isRecord(first.content) ? first.content.parts : undefined;
  const candidateParts: unknown[] = Array.isArray(rawParts) ? rawParts : [];

  const text = candidateParts
    .filter(isTextPart)
    .map((part) => part.text)
    .join("")
    .trim();

  if (!text) {
    throw new Error("Gemini PDF returned no text.");
  }

  return text;
}
|
||||
Reference in New Issue
Block a user