feat: add PDF analysis tool with native provider support (#31319)

* feat: add PDF analysis tool with native provider support

New `pdf` tool that performs model-powered analysis of PDF documents.

Architecture:
- Native PDF path: sends raw PDF bytes directly to providers that support
  inline document input (Anthropic via DocumentBlockParam, Google Gemini
  via inlineData with application/pdf MIME type)
- Extraction fallback: for providers without native PDF support, extracts
  text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas,
  then sends through the standard vision/text completion path

Key features:
- Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10)
- Page range selection (`pages` param, e.g. "1-5", "1,3,7-9")
- Model override (`model` param) and file size limits (`maxBytesMb`)
- Auto-detects provider capability and falls back gracefully
- Same security patterns as image tool (SSRF guards, sandbox support,
  local path roots, workspace-only policy)

Config (agents.defaults):
- pdfModel: primary/fallbacks (defaults to imageModel, then session model)
- pdfMaxBytesMb: max PDF file size (default: 10)
- pdfMaxPages: max pages to process (default: 20)

Model catalog:
- Extended ModelInputType to include "document" alongside "text"/"image"
- Added modelSupportsDocument() capability check

Files:
- src/agents/tools/pdf-tool.ts - main tool factory
- src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.)
- src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google
- src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths
- Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help

* fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
This commit is contained in:
Tyler Yust
2026-03-01 22:39:12 -08:00
committed by GitHub
parent 31b6e58a1b
commit d0ac1b0195
17 changed files with 2008 additions and 100 deletions

View File

@@ -5,13 +5,15 @@ import { ensureOpenClawModelsJson } from "./models-config.js";
// Module-scoped logger tagged with this subsystem's name; createSubsystemLogger
// is a project helper imported elsewhere in this file.
const log = createSubsystemLogger("model-catalog");
/**
 * Input modalities a catalog model can accept. "document" marks models with
 * native PDF/document ingestion support (e.g. inline document blocks).
 */
export type ModelInputType = "text" | "image" | "document";

/**
 * A single entry in the model catalog.
 *
 * `input` lists the modalities the model accepts; when omitted, capability
 * checks (vision/document) treat the model as text-only.
 */
export type ModelCatalogEntry = {
  id: string;
  name: string;
  provider: string;
  contextWindow?: number;
  reasoning?: boolean;
  // NOTE: the diff residue duplicated this property; only the ModelInputType[]
  // form is kept — the old Array<"text" | "image"> declaration is superseded.
  input?: ModelInputType[];
};
type DiscoveredModel = {
@@ -20,7 +22,7 @@ type DiscoveredModel = {
provider: string;
contextWindow?: number;
reasoning?: boolean;
input?: Array<"text" | "image">;
input?: ModelInputType[];
};
// Type of the pi-model-discovery module. `typeof import(...)` is type-only:
// it does not trigger a runtime import of the module.
type PiSdkModule = typeof import("./pi-model-discovery.js");
@@ -60,12 +62,12 @@ function applyOpenAICodexSparkFallback(models: ModelCatalogEntry[]): void {
});
}
function normalizeConfiguredModelInput(input: unknown): Array<"text" | "image"> | undefined {
function normalizeConfiguredModelInput(input: unknown): ModelInputType[] | undefined {
if (!Array.isArray(input)) {
return undefined;
}
const normalized = input.filter(
(item): item is "text" | "image" => item === "text" || item === "image",
(item): item is ModelInputType => item === "text" || item === "image" || item === "document",
);
return normalized.length > 0 ? normalized : undefined;
}
@@ -248,6 +250,13 @@ export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boole
return entry?.input?.includes("image") ?? false;
}
/**
 * Check if a model supports native document/PDF input based on its catalog entry.
 */
export function modelSupportsDocument(entry: ModelCatalogEntry | undefined): boolean {
  const inputs = entry?.input;
  if (inputs === undefined) {
    // No entry, or an entry without an input list: assume no document support.
    return false;
  }
  return inputs.includes("document");
}
/**
* Find a model in the catalog by provider and model ID.
*/