mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-09 12:37:40 +00:00
fix(pi): stop history image reinjection token blowup
This commit is contained in:
@@ -36,8 +36,6 @@ export interface DetectedImageRef {
|
||||
type: "path" | "url";
|
||||
/** The resolved/normalized path or URL */
|
||||
resolved: string;
|
||||
/** Index of the message this ref was found in (for history images) */
|
||||
messageIndex?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -268,93 +266,6 @@ export function modelSupportsImages(model: { input?: string[] }): boolean {
|
||||
return model.input?.includes("image") ?? false;
|
||||
}
|
||||
|
||||
function extractTextFromMessage(message: unknown): string {
|
||||
if (!message || typeof message !== "object") {
|
||||
return "";
|
||||
}
|
||||
const content = (message as { content?: unknown }).content;
|
||||
if (typeof content === "string") {
|
||||
return content;
|
||||
}
|
||||
if (!Array.isArray(content)) {
|
||||
return "";
|
||||
}
|
||||
const textParts: string[] = [];
|
||||
for (const part of content) {
|
||||
if (!part || typeof part !== "object") {
|
||||
continue;
|
||||
}
|
||||
const record = part as Record<string, unknown>;
|
||||
if (record.type === "text" && typeof record.text === "string") {
|
||||
textParts.push(record.text);
|
||||
}
|
||||
}
|
||||
return textParts.join("\n").trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts image references from conversation history messages.
|
||||
* Scans user messages for image paths/URLs that can be loaded.
|
||||
* Each ref includes the messageIndex so images can be injected at their original location.
|
||||
*
|
||||
* Note: Global deduplication is intentional - if the same image appears in multiple
|
||||
* messages, we only inject it at the FIRST occurrence. This is sufficient because:
|
||||
* 1. The model sees all message content including the image
|
||||
* 2. Later references to "the image" or "that picture" will work since it's in context
|
||||
* 3. Injecting duplicates would waste tokens and potentially hit size limits
|
||||
*/
|
||||
function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
|
||||
const allRefs: DetectedImageRef[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
const messageHasImageContent = (msg: unknown): boolean => {
|
||||
if (!msg || typeof msg !== "object") {
|
||||
return false;
|
||||
}
|
||||
const content = (msg as { content?: unknown }).content;
|
||||
if (!Array.isArray(content)) {
|
||||
return false;
|
||||
}
|
||||
return content.some(
|
||||
(part) =>
|
||||
part != null && typeof part === "object" && (part as { type?: string }).type === "image",
|
||||
);
|
||||
};
|
||||
|
||||
for (let i = 0; i < messages.length; i++) {
|
||||
const msg = messages[i];
|
||||
if (!msg || typeof msg !== "object") {
|
||||
continue;
|
||||
}
|
||||
const message = msg as { role?: string };
|
||||
// Only scan user messages for image references
|
||||
if (message.role !== "user") {
|
||||
continue;
|
||||
}
|
||||
// Skip if message already has image content (prevents reloading each turn)
|
||||
if (messageHasImageContent(msg)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const text = extractTextFromMessage(msg);
|
||||
if (!text) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const refs = detectImageReferences(text);
|
||||
for (const ref of refs) {
|
||||
const key = ref.resolved.toLowerCase();
|
||||
if (seen.has(key)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(key);
|
||||
allRefs.push({ ...ref, messageIndex: i });
|
||||
}
|
||||
}
|
||||
|
||||
return allRefs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects and loads images referenced in a prompt for models with vision capability.
|
||||
*
|
||||
@@ -362,18 +273,14 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] {
|
||||
* loads them, and returns them as ImageContent array ready to be passed to
|
||||
* the model's prompt method.
|
||||
*
|
||||
* Also scans conversation history for images from previous turns and returns
|
||||
* them mapped by message index so they can be injected at their original location.
|
||||
*
|
||||
* @param params Configuration for image detection and loading
|
||||
* @returns Object with loaded images for current prompt and history images by message index
|
||||
* @returns Object with loaded images for current prompt only
|
||||
*/
|
||||
export async function detectAndLoadPromptImages(params: {
|
||||
prompt: string;
|
||||
workspaceDir: string;
|
||||
model: { input?: string[] };
|
||||
existingImages?: ImageContent[];
|
||||
historyMessages?: unknown[];
|
||||
maxBytes?: number;
|
||||
maxDimensionPx?: number;
|
||||
workspaceOnly?: boolean;
|
||||
@@ -381,8 +288,6 @@ export async function detectAndLoadPromptImages(params: {
|
||||
}): Promise<{
|
||||
/** Images for the current prompt (existingImages + detected in current prompt) */
|
||||
images: ImageContent[];
|
||||
/** Images from history messages, keyed by message index */
|
||||
historyImagesByIndex: Map<number, ImageContent[]>;
|
||||
detectedRefs: DetectedImageRef[];
|
||||
loadedCount: number;
|
||||
skippedCount: number;
|
||||
@@ -391,7 +296,6 @@ export async function detectAndLoadPromptImages(params: {
|
||||
if (!modelSupportsImages(params.model)) {
|
||||
return {
|
||||
images: [],
|
||||
historyImagesByIndex: new Map(),
|
||||
detectedRefs: [],
|
||||
loadedCount: 0,
|
||||
skippedCount: 0,
|
||||
@@ -399,38 +303,20 @@ export async function detectAndLoadPromptImages(params: {
|
||||
}
|
||||
|
||||
// Detect images from current prompt
|
||||
const promptRefs = detectImageReferences(params.prompt);
|
||||
|
||||
// Detect images from conversation history (with message indices)
|
||||
const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : [];
|
||||
|
||||
// Deduplicate: if an image is in the current prompt, don't also load it from history.
|
||||
// Current prompt images are passed via the `images` parameter to prompt(), while history
|
||||
// images are injected into their original message positions. We don't want the same
|
||||
// image loaded and sent twice (wasting tokens and potentially causing confusion).
|
||||
const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
|
||||
const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase()));
|
||||
|
||||
const allRefs = [...promptRefs, ...uniqueHistoryRefs];
|
||||
const allRefs = detectImageReferences(params.prompt);
|
||||
|
||||
if (allRefs.length === 0) {
|
||||
return {
|
||||
images: params.existingImages ?? [],
|
||||
historyImagesByIndex: new Map(),
|
||||
detectedRefs: [],
|
||||
loadedCount: 0,
|
||||
skippedCount: 0,
|
||||
};
|
||||
}
|
||||
|
||||
log.debug(
|
||||
`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
|
||||
);
|
||||
log.debug(`Native image: detected ${allRefs.length} image refs in prompt`);
|
||||
|
||||
// Load images for current prompt
|
||||
const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
|
||||
// Load images for history, grouped by message index
|
||||
const historyImagesByIndex = new Map<number, ImageContent[]>();
|
||||
|
||||
let loadedCount = 0;
|
||||
let skippedCount = 0;
|
||||
@@ -442,18 +328,7 @@ export async function detectAndLoadPromptImages(params: {
|
||||
sandbox: params.sandbox,
|
||||
});
|
||||
if (image) {
|
||||
if (ref.messageIndex !== undefined) {
|
||||
// History image - add to the appropriate message index
|
||||
const existing = historyImagesByIndex.get(ref.messageIndex);
|
||||
if (existing) {
|
||||
existing.push(image);
|
||||
} else {
|
||||
historyImagesByIndex.set(ref.messageIndex, [image]);
|
||||
}
|
||||
} else {
|
||||
// Current prompt image
|
||||
promptImages.push(image);
|
||||
}
|
||||
promptImages.push(image);
|
||||
loadedCount++;
|
||||
log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
|
||||
} else {
|
||||
@@ -469,21 +344,9 @@ export async function detectAndLoadPromptImages(params: {
|
||||
"prompt:images",
|
||||
imageSanitization,
|
||||
);
|
||||
const sanitizedHistoryImagesByIndex = new Map<number, ImageContent[]>();
|
||||
for (const [index, images] of historyImagesByIndex) {
|
||||
const sanitized = await sanitizeImagesWithLog(
|
||||
images,
|
||||
`history:images:${index}`,
|
||||
imageSanitization,
|
||||
);
|
||||
if (sanitized.length > 0) {
|
||||
sanitizedHistoryImagesByIndex.set(index, sanitized);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
images: sanitizedPromptImages,
|
||||
historyImagesByIndex: sanitizedHistoryImagesByIndex,
|
||||
detectedRefs: allRefs,
|
||||
loadedCount,
|
||||
skippedCount,
|
||||
|
||||
Reference in New Issue
Block a user