feat: add PDF analysis tool with native provider support (#31319)

* feat: add PDF analysis tool with native provider support

New `pdf` tool that performs model-powered analysis of PDF documents.

Architecture:
- Native PDF path: sends raw PDF bytes directly to providers that support
  inline document input (Anthropic via DocumentBlockParam, Google Gemini
  via inlineData with application/pdf MIME type)
- Extraction fallback: for providers without native PDF support, extracts
  text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas,
  then sends through the standard vision/text completion path

Key features:
- Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10)
- Page range selection (`pages` param, e.g. "1-5", "1,3,7-9")
- Model override (`model` param) and file size limits (`maxBytesMb`)
- Auto-detects provider capability and falls back gracefully
- Same security patterns as image tool (SSRF guards, sandbox support,
  local path roots, workspace-only policy)

Config (agents.defaults):
- pdfModel: primary/fallbacks (defaults to imageModel, then session model)
- pdfMaxBytesMb: max PDF file size (default: 10)
- pdfMaxPages: max pages to process (default: 20)

Model catalog:
- Extended ModelInputType to include "document" alongside "text"/"image"
- Added modelSupportsDocument() capability check

Files:
- src/agents/tools/pdf-tool.ts - main tool factory
- src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.)
- src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google
- src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths
- Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help

* fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
This commit is contained in:
Tyler Yust
2026-03-01 22:39:12 -08:00
committed by GitHub
parent 31b6e58a1b
commit d0ac1b0195
17 changed files with 2008 additions and 100 deletions

View File

@@ -5,13 +5,15 @@ import { ensureOpenClawModelsJson } from "./models-config.js";
// Module-scoped logger tagged with this subsystem's name; createSubsystemLogger
// is a project helper imported elsewhere in this file.
const log = createSubsystemLogger("model-catalog");
/**
 * Input modalities a catalog model can accept. "document" marks models with
 * native PDF/document ingestion support (e.g. inline document blocks).
 */
export type ModelInputType = "text" | "image" | "document";

/**
 * A single entry in the model catalog.
 *
 * `input` lists the modalities the model accepts; when omitted, capability
 * checks (vision/document) treat the model as text-only.
 */
export type ModelCatalogEntry = {
  id: string;
  name: string;
  provider: string;
  contextWindow?: number;
  reasoning?: boolean;
  // NOTE: the diff residue duplicated this property; only the ModelInputType[]
  // form is kept — the old Array<"text" | "image"> declaration is superseded.
  input?: ModelInputType[];
};
type DiscoveredModel = {
@@ -20,7 +22,7 @@ type DiscoveredModel = {
provider: string;
contextWindow?: number;
reasoning?: boolean;
input?: Array<"text" | "image">;
input?: ModelInputType[];
};
// Type of the pi-model-discovery module. `typeof import(...)` is type-only:
// it does not trigger a runtime import of the module.
type PiSdkModule = typeof import("./pi-model-discovery.js");
@@ -60,12 +62,12 @@ function applyOpenAICodexSparkFallback(models: ModelCatalogEntry[]): void {
});
}
function normalizeConfiguredModelInput(input: unknown): Array<"text" | "image"> | undefined {
function normalizeConfiguredModelInput(input: unknown): ModelInputType[] | undefined {
if (!Array.isArray(input)) {
return undefined;
}
const normalized = input.filter(
(item): item is "text" | "image" => item === "text" || item === "image",
(item): item is ModelInputType => item === "text" || item === "image" || item === "document",
);
return normalized.length > 0 ? normalized : undefined;
}
@@ -248,6 +250,13 @@ export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boole
return entry?.input?.includes("image") ?? false;
}
/**
 * Check if a model supports native document/PDF input based on its catalog entry.
 */
export function modelSupportsDocument(entry: ModelCatalogEntry | undefined): boolean {
  const inputs = entry?.input;
  if (inputs === undefined) {
    // No entry, or an entry without an input list: assume no document support.
    return false;
  }
  return inputs.includes("document");
}
/**
* Find a model in the catalog by provider and model ID.
*/