mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-10 21:44:32 +00:00
feat: add PDF analysis tool with native provider support (#31319)
* feat: add PDF analysis tool with native provider support New `pdf` tool for analyzing PDF documents with model-powered analysis. Architecture: - Native PDF path: sends raw PDF bytes directly to providers that support inline document input (Anthropic via DocumentBlockParam, Google Gemini via inlineData with application/pdf MIME type) - Extraction fallback: for providers without native PDF support, extracts text via pdfjs-dist and rasterizes pages to images via @napi-rs/canvas, then sends through the standard vision/text completion path Key features: - Single PDF (`pdf` param) or multiple PDFs (`pdfs` array, up to 10) - Page range selection (`pages` param, e.g. "1-5", "1,3,7-9") - Model override (`model` param) and file size limits (`maxBytesMb`) - Auto-detects provider capability and falls back gracefully - Same security patterns as image tool (SSRF guards, sandbox support, local path roots, workspace-only policy) Config (agents.defaults): - pdfModel: primary/fallbacks (defaults to imageModel, then session model) - pdfMaxBytesMb: max PDF file size (default: 10) - pdfMaxPages: max pages to process (default: 20) Model catalog: - Extended ModelInputType to include "document" alongside "text"/"image" - Added modelSupportsDocument() capability check Files: - src/agents/tools/pdf-tool.ts - main tool factory - src/agents/tools/pdf-tool.helpers.ts - helpers (page range, config, etc.) - src/agents/tools/pdf-native-providers.ts - direct API calls for Anthropic/Google - src/agents/tools/pdf-tool.test.ts - 43 tests covering all paths - Modified: model-catalog.ts, openclaw-tools.ts, config schema/types/labels/help * fix: prepare pdf tool for merge (#31319) (thanks @tyler6204)
This commit is contained in:
@@ -5,13 +5,15 @@ import { ensureOpenClawModelsJson } from "./models-config.js";
|
||||
|
||||
// Subsystem-scoped logger for model-catalog messages.
const log = createSubsystemLogger("model-catalog");
|
||||
|
||||
/**
 * Input modality a model can accept. "document" marks models with native
 * document/PDF input support (see modelSupportsDocument).
 */
export type ModelInputType = "text" | "image" | "document";
|
||||
|
||||
export type ModelCatalogEntry = {
|
||||
id: string;
|
||||
name: string;
|
||||
provider: string;
|
||||
contextWindow?: number;
|
||||
reasoning?: boolean;
|
||||
input?: Array<"text" | "image">;
|
||||
input?: ModelInputType[];
|
||||
};
|
||||
|
||||
type DiscoveredModel = {
|
||||
@@ -20,7 +22,7 @@ type DiscoveredModel = {
|
||||
provider: string;
|
||||
contextWindow?: number;
|
||||
reasoning?: boolean;
|
||||
input?: Array<"text" | "image">;
|
||||
input?: ModelInputType[];
|
||||
};
|
||||
|
||||
// Type alias for the pi-model-discovery module's exports (type-only; no runtime import).
type PiSdkModule = typeof import("./pi-model-discovery.js");
|
||||
@@ -60,12 +62,12 @@ function applyOpenAICodexSparkFallback(models: ModelCatalogEntry[]): void {
|
||||
});
|
||||
}
|
||||
|
||||
function normalizeConfiguredModelInput(input: unknown): Array<"text" | "image"> | undefined {
|
||||
function normalizeConfiguredModelInput(input: unknown): ModelInputType[] | undefined {
|
||||
if (!Array.isArray(input)) {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = input.filter(
|
||||
(item): item is "text" | "image" => item === "text" || item === "image",
|
||||
(item): item is ModelInputType => item === "text" || item === "image" || item === "document",
|
||||
);
|
||||
return normalized.length > 0 ? normalized : undefined;
|
||||
}
|
||||
@@ -248,6 +250,13 @@ export function modelSupportsVision(entry: ModelCatalogEntry | undefined): boole
|
||||
return entry?.input?.includes("image") ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a model supports native document/PDF input based on its catalog entry.
|
||||
*/
|
||||
export function modelSupportsDocument(entry: ModelCatalogEntry | undefined): boolean {
|
||||
return entry?.input?.includes("document") ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find a model in the catalog by provider and model ID.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user