feat: add PDF analysis tool with native provider support (#31319)

* feat: add PDF analysis tool with native provider support New `pdf` tool for analyzing PDF documents with model-powered analysis. Architecture: - Native PDF path: sends raw PDF bytes directly to providers that support inline document input (Anthropic via DocumentBlockParam, Google Gemini via inlineData with application/pdf MIME type) - Extraction fallback: for providers without native PDF support, extracts text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas, then sends through the standard vision/text completion path Key features: - Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10) - Page range selection (`pages` param, e.g. "1-5", "1,3,7-9") - Model override (`model` param) and file size limits (`maxBytesMb`) - Auto-detects provider capability and falls back gracefully - Same security patterns as image tool (SSRF guards, sandbox support, local path roots, workspace-only policy) Config (agents.defaults): - pdfModel: primary/fallbacks (defaults to imageModel, then session model) - pdfMaxBytesMb: max PDF file size (default: 10) - pdfMaxPages: max pages to process (default: 20) Model catalog: - Extended ModelInputType to include "document" alongside "text"/"image" - Added modelSupportsDocument() capability check Files: - src/agents/tools/pdf-tool.ts - main tool factory - src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.) - src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google - src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths - Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help * fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
2026-05-12 11:51:12 +00:00 · 2026-03-01 22:39:12 -08:00
parent 31b6e58a1b
commit d0ac1b0195
17 changed files with 2008 additions and 100 deletions
--- a/src/agents/tools/pdf-native-providers.ts
+++ b/src/agents/tools/pdf-native-providers.ts
@@ -0,0 +1,179 @@
+/**
+ * Direct SDK/HTTP calls for providers that support native PDF document input.
+ * This bypasses pi-ai's content type system which does not have a "document" type.
+ */
+
+import { isRecord } from "../../utils.js";
+import { normalizeSecretInput } from "../../utils/normalize-secret-input.js";
+
+/** A single PDF handed to one of the native-provider calls in this module. */
+type PdfInput = {
+  // Base64-encoded PDF payload; forwarded unchanged as the document data in the request body.
+  base64: string;
+  // Optional file name. Nothing in this module's visible code reads it.
+  filename?: string;
+};
+
+// ---------------------------------------------------------------------------
+// Anthropic – native PDF via Messages API
+// ---------------------------------------------------------------------------
+
+/** Request content block that carries one PDF inline as base64. */
+type AnthropicDocBlock = {
+  type: "document";
+  source: {
+    type: "base64";
+    media_type: "application/pdf";
+    data: string;
+  };
+};
+
+/** Request content block that carries plain text (used for the prompt). */
+type AnthropicTextBlock = {
+  type: "text";
+  text: string;
+};
+
+/** Any block that may appear in the outgoing user message's `content` array. */
+type AnthropicContentBlock = AnthropicDocBlock | AnthropicTextBlock;
+
+/**
+ * Loose view of the `content` array in a Messages API response.
+ * Only `type` and `text` are inspected by this module, so nothing else is modelled.
+ */
+type AnthropicResponseContent = Array<{ type: string; text?: string }>;
+
+/**
+ * Analyze one or more PDFs with an Anthropic model through the Messages API.
+ *
+ * Every PDF is attached as a base64 `document` content block, followed by a
+ * single `text` block holding the prompt, all inside one user message.
+ *
+ * @param params.apiKey - Anthropic API key; run through `normalizeSecretInput` before use.
+ * @param params.modelId - Model identifier sent as `model`.
+ * @param params.prompt - Instruction text appended after the documents.
+ * @param params.pdfs - PDFs to attach, in the given order.
+ * @param params.maxTokens - Value for `max_tokens`; defaults to 4096.
+ * @param params.baseUrl - API origin; defaults to `https://api.anthropic.com`. Trailing slashes are stripped.
+ * @returns The concatenated text of all `text` blocks in the response, trimmed.
+ * @throws Error when the key is empty after normalization, the HTTP status is
+ *   not OK, the body is not a JSON object, `content` is not an array, or the
+ *   response contains no non-blank text.
+ */
+export async function anthropicAnalyzePdf(params: {
+  apiKey: string;
+  modelId: string;
+  prompt: string;
+  pdfs: PdfInput[];
+  maxTokens?: number;
+  baseUrl?: string;
+}): Promise<string> {
+  const apiKey = normalizeSecretInput(params.apiKey);
+  if (!apiKey) {
+    throw new Error("Anthropic PDF: apiKey required");
+  }
+
+  // Documents first, prompt last, so the instruction follows the material it refers to.
+  const content: AnthropicContentBlock[] = [
+    ...params.pdfs.map(
+      (pdf): AnthropicDocBlock => ({
+        type: "document",
+        source: {
+          type: "base64",
+          media_type: "application/pdf",
+          data: pdf.base64,
+        },
+      }),
+    ),
+    { type: "text", text: params.prompt },
+  ];
+
+  // Strip trailing slashes so the joined URL never contains "//v1".
+  const baseUrl = (params.baseUrl ?? "https://api.anthropic.com").replace(/\/+$/, "");
+  const res = await fetch(`${baseUrl}/v1/messages`, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+      "x-api-key": apiKey,
+      "anthropic-version": "2023-06-01",
+      "anthropic-beta": "pdfs-2024-09-25",
+    },
+    body: JSON.stringify({
+      model: params.modelId,
+      max_tokens: params.maxTokens ?? 4096,
+      messages: [{ role: "user", content }],
+    }),
+  });
+
+  if (!res.ok) {
+    // Best effort: include a bounded slice of the error body for diagnostics.
+    const body = await res.text().catch(() => "");
+    throw new Error(
+      `Anthropic PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
+    );
+  }
+
+  const json = (await res.json().catch(() => null)) as unknown;
+  if (!isRecord(json)) {
+    throw new Error("Anthropic PDF response was not JSON.");
+  }
+
+  // The payload is untrusted JSON: entries are typed as possibly null so the
+  // compiler forces a guard before any property access.
+  const responseContent = json.content as
+    | Array<AnthropicResponseContent[number] | null>
+    | undefined;
+  if (!Array.isArray(responseContent)) {
+    throw new Error("Anthropic PDF response missing content array.");
+  }
+
+  let text = "";
+  for (const block of responseContent) {
+    // Optional chaining skips null entries instead of throwing a TypeError.
+    if (block?.type === "text" && typeof block.text === "string") {
+      text += block.text;
+    }
+  }
+
+  const trimmed = text.trim();
+  if (!trimmed) {
+    throw new Error("Anthropic PDF returned no text.");
+  }
+
+  return trimmed;
+}
+
+// ---------------------------------------------------------------------------
+// Google Gemini – native PDF via generateContent API
+// ---------------------------------------------------------------------------
+
+/** One request part: either an inline base64 payload with its MIME type, or plain text. */
+type GeminiPart = { inline_data: { mime_type: string; data: string } } | { text: string };
+
+/**
+ * Subset of a response candidate that this module reads.
+ * Only `content.parts[].text` is inspected; every level is optional.
+ */
+type GeminiCandidate = {
+  content?: { parts?: Array<{ text?: string }> };
+};
+
+/**
+ * Analyze one or more PDFs with a Google Gemini model through `generateContent`.
+ *
+ * Every PDF is attached as an `inline_data` part with MIME type
+ * `application/pdf`, followed by a single text part holding the prompt.
+ *
+ * @param params.apiKey - Gemini API key; run through `normalizeSecretInput` before use.
+ * @param params.modelId - Model identifier; URL-encoded into the request path.
+ * @param params.prompt - Instruction text appended after the documents.
+ * @param params.pdfs - PDFs to attach, in the given order.
+ * @param params.baseUrl - API origin; defaults to `https://generativelanguage.googleapis.com`. Trailing slashes are stripped.
+ * @returns The concatenated text parts of the first candidate, trimmed.
+ * @throws Error when the key is empty after normalization, the HTTP status is
+ *   not OK, the body is not a JSON object, there are no candidates, or the
+ *   first candidate contains no non-blank text.
+ */
+export async function geminiAnalyzePdf(params: {
+  apiKey: string;
+  modelId: string;
+  prompt: string;
+  pdfs: PdfInput[];
+  baseUrl?: string;
+}): Promise<string> {
+  const apiKey = normalizeSecretInput(params.apiKey);
+  if (!apiKey) {
+    throw new Error("Gemini PDF: apiKey required");
+  }
+
+  // Documents first, prompt last.
+  const parts: GeminiPart[] = [
+    ...params.pdfs.map(
+      (pdf): GeminiPart => ({
+        inline_data: {
+          mime_type: "application/pdf",
+          data: pdf.base64,
+        },
+      }),
+    ),
+    { text: params.prompt },
+  ];
+
+  // Strip trailing slashes so the joined URL never contains "//v1beta".
+  const baseUrl = (params.baseUrl ?? "https://generativelanguage.googleapis.com").replace(
+    /\/+$/,
+    "",
+  );
+  // NOTE(review): the API key travels in the query string, so it can show up in
+  // proxy or server access logs. Consider the `x-goog-api-key` header once it is
+  // confirmed that every configured baseUrl accepts it.
+  const url = `${baseUrl}/v1beta/models/${encodeURIComponent(params.modelId)}:generateContent?key=${encodeURIComponent(apiKey)}`;
+
+  const res = await fetch(url, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({
+      contents: [{ role: "user", parts }],
+    }),
+  });
+
+  if (!res.ok) {
+    // Best effort: include a bounded slice of the error body for diagnostics.
+    const body = await res.text().catch(() => "");
+    throw new Error(
+      `Gemini PDF request failed (${res.status} ${res.statusText})${body ? `: ${body.slice(0, 400)}` : ""}`,
+    );
+  }
+
+  const json = (await res.json().catch(() => null)) as unknown;
+  if (!isRecord(json)) {
+    throw new Error("Gemini PDF response was not JSON.");
+  }
+
+  // The payload is untrusted JSON: entries are typed as possibly null so the
+  // compiler forces a guard before any property access.
+  const candidates = json.candidates as Array<GeminiCandidate | null> | undefined;
+  if (!Array.isArray(candidates) || candidates.length === 0) {
+    throw new Error("Gemini PDF returned no candidates.");
+  }
+
+  // Only the first candidate is used. A null candidate or a non-array `parts`
+  // falls through to the "no text" error below instead of raising a TypeError.
+  const responseParts: unknown = candidates[0]?.content?.parts;
+  let text = "";
+  if (Array.isArray(responseParts)) {
+    for (const part of responseParts as Array<{ text?: string } | null>) {
+      const piece = part?.text;
+      if (typeof piece === "string") {
+        text += piece;
+      }
+    }
+  }
+
+  const trimmed = text.trim();
+  if (!trimmed) {
+    throw new Error("Gemini PDF returned no text.");
+  }
+
+  return trimmed;
+}