fix(telegram): improve sticker vision + cache (#2548) (thanks @longjos)

This commit is contained in:
Ayaan Zaidi
2026-01-27 12:47:04 +05:30
committed by Ayaan Zaidi
parent 506bed5aed
commit 34fea720f8
11 changed files with 240 additions and 37 deletions

View File

@@ -139,6 +139,7 @@ export const dispatchTelegramMessage = async ({
imagePath: ctxPayload.MediaPath,
cfg,
agentDir,
agentId: route.agentId,
});
if (description) {
// Format the description with sticker context

View File

@@ -7,6 +7,9 @@ const middlewareUseSpy = vi.fn();
const onSpy = vi.fn();
const stopSpy = vi.fn();
const sendChatActionSpy = vi.fn();
// Spies standing in for the "./sticker-cache.js" exports; the vi.mock factory for
// that module forwards every call to these so tests can stub and inspect them.
const cacheStickerSpy = vi.fn();
const getCachedStickerSpy = vi.fn();
const describeStickerImageSpy = vi.fn();
type ApiStub = {
config: { use: (arg: unknown) => void };
@@ -79,6 +82,12 @@ vi.mock("../config/sessions.js", async (importOriginal) => {
};
});
vi.mock("./sticker-cache.js", () => {
  // Each replacement export forwards its arguments to the matching module-level spy.
  // The spy is looked up at call time, not when the factory itself runs.
  const cacheSticker = (...forwarded: unknown[]) => cacheStickerSpy(...forwarded);
  const getCachedSticker = (...forwarded: unknown[]) => getCachedStickerSpy(...forwarded);
  const describeStickerImage = (...forwarded: unknown[]) => describeStickerImageSpy(...forwarded);
  return { cacheSticker, getCachedSticker, describeStickerImage };
});
vi.mock("./pairing-store.js", () => ({
readTelegramAllowFromStore: vi.fn(async () => [] as string[]),
upsertTelegramPairingRequest: vi.fn(async () => ({
@@ -408,6 +417,12 @@ describe("telegram media groups", () => {
describe("telegram stickers", () => {
const STICKER_TEST_TIMEOUT_MS = process.platform === "win32" ? 30_000 : 20_000;
// Start every sticker test with pristine sticker-cache spies (no recorded calls,
// no stubbed return values).
beforeEach(() => {
  for (const stickerSpy of [cacheStickerSpy, getCachedStickerSpy, describeStickerImageSpy]) {
    stickerSpy.mockReset();
  }
});
it(
"downloads static sticker (WEBP) and includes sticker metadata",
async () => {
@@ -481,6 +496,88 @@ describe("telegram stickers", () => {
STICKER_TEST_TIMEOUT_MS,
);
// Cache-hit path: when the incoming sticker shares a file_unique_id with a cached
// entry but carries a different file_id/emoji/set_name, the cache entry must be
// rewritten with the fresh values while the cached description is still reused.
it(
  "refreshes cached sticker metadata on cache hit",
  async () => {
    const { createTelegramBot } = await import("./bot.js");
    const replyModule = await import("../auto-reply/reply.js");
    const replySpy = replyModule.__replySpy as unknown as ReturnType<typeof vi.fn>;
    onSpy.mockReset();
    replySpy.mockReset();
    sendChatActionSpy.mockReset();

    // Stale cache entry: same unique id as the incoming sticker, outdated metadata.
    getCachedStickerSpy.mockReturnValue({
      fileId: "old_file_id",
      fileUniqueId: "sticker_unique_456",
      emoji: "😴",
      setName: "OldSet",
      description: "Cached description",
      cachedAt: "2026-01-20T10:00:00.000Z",
    });

    const runtimeError = vi.fn();
    createTelegramBot({
      token: "tok",
      runtime: {
        log: vi.fn(),
        error: runtimeError,
        exit: () => {
          throw new Error("exit");
        },
      },
    });
    const handler = onSpy.mock.calls.find((call) => call[0] === "message")?.[1] as (
      ctx: Record<string, unknown>,
    ) => Promise<void>;
    expect(handler).toBeDefined();

    const fetchSpy = vi.spyOn(globalThis, "fetch" as never).mockResolvedValueOnce({
      ok: true,
      status: 200,
      statusText: "OK",
      headers: { get: () => "image/webp" },
      arrayBuffer: async () => new Uint8Array([0x52, 0x49, 0x46, 0x46]).buffer,
    } as Response);

    // Restore the global fetch spy even when the handler throws or an assertion
    // fails, so a failure here cannot leak a stubbed fetch into later tests.
    try {
      await handler({
        message: {
          message_id: 103,
          chat: { id: 1234, type: "private" },
          sticker: {
            file_id: "new_file_id",
            file_unique_id: "sticker_unique_456",
            type: "regular",
            width: 512,
            height: 512,
            is_animated: false,
            is_video: false,
            emoji: "🔥",
            set_name: "NewSet",
          },
          date: 1736380800,
        },
        me: { username: "clawdbot_bot" },
        getFile: async () => ({ file_path: "stickers/sticker.webp" }),
      });

      expect(runtimeError).not.toHaveBeenCalled();
      expect(cacheStickerSpy).toHaveBeenCalledWith(
        expect.objectContaining({
          fileId: "new_file_id",
          emoji: "🔥",
          setName: "NewSet",
        }),
      );
      // Fail with a readable message (not a TypeError) if no reply was dispatched.
      expect(replySpy).toHaveBeenCalled();
      const payload = replySpy.mock.calls[0][0];
      expect(payload.Sticker?.fileId).toBe("new_file_id");
      expect(payload.Sticker?.cachedDescription).toBe("Cached description");
    } finally {
      fetchSpy.mockRestore();
    }
  },
  STICKER_TEST_TIMEOUT_MS,
);
it(
"skips animated stickers (TGS format)",
async () => {

View File

@@ -22,7 +22,7 @@ import { buildInlineKeyboard } from "../send.js";
import { resolveTelegramVoiceSend } from "../voice.js";
import { buildTelegramThreadParams, resolveTelegramReplyId } from "./helpers.js";
import type { StickerMetadata, TelegramContext } from "./types.js";
import { getCachedSticker } from "../sticker-cache.js";
import { cacheSticker, getCachedSticker } from "../sticker-cache.js";
const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i;
const VOICE_FORBIDDEN_RE = /VOICE_MESSAGES_FORBIDDEN/;
@@ -303,14 +303,26 @@ export async function resolveMedia(
const cached = sticker.file_unique_id ? getCachedSticker(sticker.file_unique_id) : null;
if (cached) {
logVerbose(`telegram: sticker cache hit for ${sticker.file_unique_id}`);
const fileId = sticker.file_id ?? cached.fileId;
const emoji = sticker.emoji ?? cached.emoji;
const setName = sticker.set_name ?? cached.setName;
if (fileId !== cached.fileId || emoji !== cached.emoji || setName !== cached.setName) {
// Refresh cached sticker metadata on hits so sends/searches use latest file_id.
cacheSticker({
...cached,
fileId,
emoji,
setName,
});
}
return {
path: saved.path,
contentType: saved.contentType,
placeholder: "<media:sticker>",
stickerMetadata: {
emoji: cached.emoji,
setName: cached.setName,
fileId: cached.fileId,
emoji,
setName,
fileId,
fileUniqueId: sticker.file_unique_id,
cachedDescription: cached.description,
},
@@ -330,7 +342,7 @@ export async function resolveMedia(
},
};
} catch (err) {
logVerbose(`telegram: failed to process sticker: ${err}`);
logVerbose(`telegram: failed to process sticker: ${String(err)}`);
return null;
}
}

View File

@@ -4,7 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js";
import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
import { logVerbose } from "../globals.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import {
findModelInCatalog,
loadModelCatalog,
modelSupportsVision,
} from "../agents/model-catalog.js";
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
import { resolveAutoImageModel } from "../media-understanding/runner.js";
const CACHE_FILE = path.join(STATE_DIR_CLAWDBOT, "telegram", "sticker-cache.json");
const CACHE_VERSION = 1;
@@ -135,18 +141,11 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?:
// Instruction text asking for a short, objective description of a sticker image.
// NOTE(review): presumably passed to the vision model by describeStickerImage — the
// call site is outside the visible hunks; confirm.
const STICKER_DESCRIPTION_PROMPT =
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
const VISION_PROVIDERS = ["anthropic", "openai", "google", "minimax"] as const;
const DEFAULT_VISION_MODELS: Record<string, string> = {
anthropic: "claude-sonnet-4-20250514",
openai: "gpt-4o-mini",
google: "gemini-2.0-flash",
minimax: "MiniMax-VL-01",
};
/** Inputs accepted by describeStickerImage. */
export interface DescribeStickerParams {
/** Path of the image to describe (the dispatch caller passes the message's media path). */
imagePath: string;
/** Configuration used for the model catalog lookup and for default/auto image-model resolution. */
cfg: ClawdbotConfig;
/** Optional agent directory, forwarded to image-model resolution. */
agentDir?: string;
/** Optional agent id, used to resolve that agent's default model. */
agentId?: string;
}
/**
@@ -155,26 +154,35 @@ export interface DescribeStickerParams {
* Returns null if no vision provider is available.
*/
export async function describeStickerImage(params: DescribeStickerParams): Promise<string | null> {
const { imagePath, cfg, agentDir } = params;
const { imagePath, cfg, agentDir, agentId } = params;
// Find a vision provider with available API key
let provider: string | null = null;
for (const p of VISION_PROVIDERS) {
try {
await resolveApiKeyForProvider({ provider: p, cfg, agentDir });
provider = p;
break;
} catch {
// No key for this provider, try next
const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
let activeModel = undefined as { provider: string; model: string } | undefined;
try {
const catalog = await loadModelCatalog({ config: cfg });
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
if (modelSupportsVision(entry)) {
activeModel = { provider: defaultModel.provider, model: defaultModel.model };
}
} catch {
// Ignore catalog failures; fall back to auto selection.
}
if (!provider) {
const resolved = await resolveAutoImageModel({
cfg,
agentDir,
activeModel,
});
if (!resolved) {
logVerbose("telegram: no vision provider available for sticker description");
return null;
}
const model = DEFAULT_VISION_MODELS[provider];
const { provider, model } = resolved;
if (!model) {
logVerbose(`telegram: no vision model available for ${provider}`);
return null;
}
logVerbose(`telegram: describing sticker with ${provider}/${model}`);
try {
@@ -195,7 +203,7 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
});
return result.text;
} catch (err) {
logVerbose(`telegram: failed to describe sticker: ${err}`);
logVerbose(`telegram: failed to describe sticker: ${String(err)}`);
return null;
}
}