mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-24 11:04:27 +00:00
fix(telegram): auto-wrap file references with TLD extensions to prevent URL previews
Telegram's auto-linker aggressively treats filenames like HEARTBEAT.md, README.md, main.go, script.py as URLs and generates domain registrar previews. This fix adds comprehensive protection for file extensions that share TLDs:
- High priority: .md, .go, .py, .pl, .ai, .sh
- Medium priority: .io, .tv, .fm, .am, .at, .be, .cc, .co

Implementation:
- Added wrapFileReferencesInHtml() in format.ts
- Runs AFTER markdown→HTML conversion
- Tokenizes HTML to respect tag boundaries
- Skips content inside <code>, <pre>, <a> tags (no nesting issues)
- Applied to all rendering paths: renderTelegramHtmlText, markdownToTelegramHtml, markdownToTelegramChunks, and delivery.ts fallback

Addresses review comments:
- P1: Now handles chunked rendering paths correctly
- P2: No longer wraps inside existing code blocks (token-based parsing)
- No lookbehinds used (broad Node compatibility)

Includes comprehensive test suite in format.wrap-md.test.ts

AI-assisted: true
This commit is contained in:
@@ -18,6 +18,7 @@ import {
|
||||
markdownToTelegramChunks,
|
||||
markdownToTelegramHtml,
|
||||
renderTelegramHtmlText,
|
||||
wrapFileReferencesInHtml,
|
||||
} from "../format.js";
|
||||
import { buildInlineKeyboard } from "../send.js";
|
||||
import { cacheSticker, getCachedSticker } from "../sticker-cache.js";
|
||||
@@ -76,7 +77,9 @@ export async function deliverReplies(params: {
|
||||
const nested = markdownToTelegramChunks(chunk, textLimit, { tableMode: params.tableMode });
|
||||
if (!nested.length && chunk) {
|
||||
chunks.push({
|
||||
html: markdownToTelegramHtml(chunk, { tableMode: params.tableMode }),
|
||||
html: wrapFileReferencesInHtml(
|
||||
markdownToTelegramHtml(chunk, { tableMode: params.tableMode, wrapFileRefs: false }),
|
||||
),
|
||||
text: chunk,
|
||||
});
|
||||
continue;
|
||||
|
||||
@@ -53,7 +53,7 @@ function renderTelegramHtml(ir: MarkdownIR): string {
|
||||
|
||||
export function markdownToTelegramHtml(
|
||||
markdown: string,
|
||||
options: { tableMode?: MarkdownTableMode } = {},
|
||||
options: { tableMode?: MarkdownTableMode; wrapFileRefs?: boolean } = {},
|
||||
): string {
|
||||
const ir = markdownToIR(markdown ?? "", {
|
||||
linkify: true,
|
||||
@@ -61,7 +61,117 @@ export function markdownToTelegramHtml(
|
||||
blockquotePrefix: "",
|
||||
tableMode: options.tableMode,
|
||||
});
|
||||
return renderTelegramHtml(ir);
|
||||
const html = renderTelegramHtml(ir);
|
||||
// Apply file reference wrapping if requested (for chunked rendering)
|
||||
if (options.wrapFileRefs !== false) {
|
||||
return wrapFileReferencesInHtml(html);
|
||||
}
|
||||
return html;
|
||||
}
|
||||
|
||||
/**
 * File extensions that share TLDs and commonly appear in code/documentation.
 * These are wrapped in <code> tags to prevent Telegram from generating
 * spurious domain registrar previews.
 */
const FILE_EXTENSIONS_WITH_TLD = new Set([
  // High priority - commonly referenced in messages
  "md", // Markdown (Moldova)
  "go", // Go language
  "py", // Python (Paraguay)
  "pl", // Perl (Poland)
  "ai", // Adobe Illustrator (Anguilla)
  "sh", // Shell (Saint Helena)
  // Medium priority - sometimes referenced
  "io", // British Indian Ocean Territory (often used for tech projects)
  "tv", // Tuvalu (video files)
  "fm", // Federated States of Micronesia (audio)
  "am", // Armenia
  "at", // Austria
  "be", // Belgium
  "cc", // Cocos Islands
  "co", // Colombia
]);

/**
 * Matches a standalone file reference ending in one of the tracked extensions.
 *
 * - Group 1: the boundary before the reference (start of text, `>` or one
 *   whitespace character). It is captured instead of using a lookbehind.
 * - Group 2: the file name or path itself.
 * - The reference must be followed by end of text, whitespace or `<`.
 *
 * Built once at module load. `String.prototype.replace` resets `lastIndex`
 * on global regexes, so sharing the instance between calls is safe.
 */
const FILE_REFERENCE_PATTERN = new RegExp(
  `(^|>|[\\s])([a-zA-Z0-9_.\\-/]+\\.(?:${Array.from(FILE_EXTENSIONS_WITH_TLD).join("|")}))(?=$|[\\s<])`,
  "gi",
);

/**
 * Wraps every file reference found in an unprotected text segment in
 * <code> tags. References that start with `//` (protocol-relative URLs)
 * are left untouched.
 */
function wrapFileReferencesInText(text: string): string {
  return text.replace(
    FILE_REFERENCE_PATTERN,
    (whole: string, prefix: string, filename: string) =>
      filename.startsWith("//") ? whole : `${prefix}<code>${filename}</code>`,
  );
}

/**
 * Wraps standalone file references (with TLD extensions) in <code> tags.
 * This prevents Telegram from treating them as URLs and generating
 * irrelevant domain registrar previews.
 *
 * Intended to run AFTER markdown→HTML conversion. The input is split at
 * <code>, <pre> and <a> tags (opening and closing); text that sits inside
 * any of those elements is copied verbatim to avoid nested <code> tags and
 * broken links, and only the remaining text is scanned for file references.
 *
 * @param html - Telegram-flavoured HTML (or plain text) to post-process.
 * @returns The same markup with unprotected file references wrapped in
 *   <code>…</code>.
 */
export function wrapFileReferencesInHtml(html: string): string {
  // Which protected elements we are currently inside of.
  let inCode = false;
  let inPre = false;
  let inAnchor = false;
  let result = "";
  let lastIndex = 0;

  // Group 1 is ONLY the optional slash, so a closing tag can be recognised
  // reliably; group 2 is the tag name.
  const tagPattern = /<(\/?)(code|pre|a)\b[^>]*?>/gi;
  let match: RegExpExecArray | null;

  while ((match = tagPattern.exec(html)) !== null) {
    // Text between the previous tag and this one.
    const segment = html.slice(lastIndex, match.index);
    result += inCode || inPre || inAnchor ? segment : wrapFileReferencesInText(segment);

    // An opening tag enters the protected element, a closing tag leaves it.
    const isOpening = match[1] !== "/";
    switch (match[2].toLowerCase()) {
      case "code":
        inCode = isOpening;
        break;
      case "pre":
        inPre = isOpening;
        break;
      case "a":
        inAnchor = isOpening;
        break;
    }

    // Copy the tag itself unchanged.
    result += match[0];
    lastIndex = tagPattern.lastIndex;
  }

  // Text after the last tag (or the whole input when there were no tags).
  const tail = html.slice(lastIndex);
  result += inCode || inPre || inAnchor ? tail : wrapFileReferencesInText(tail);

  return result;
}
|
||||
|
||||
export function renderTelegramHtmlText(
|
||||
@@ -70,9 +180,13 @@ export function renderTelegramHtmlText(
|
||||
): string {
|
||||
const textMode = options.textMode ?? "markdown";
|
||||
if (textMode === "html") {
|
||||
return text;
|
||||
// For HTML mode, still wrap file references in the HTML
|
||||
return wrapFileReferencesInHtml(text);
|
||||
}
|
||||
return markdownToTelegramHtml(text, { tableMode: options.tableMode });
|
||||
const html = markdownToTelegramHtml(text, { tableMode: options.tableMode });
|
||||
// Wrap file references after markdown→HTML conversion
|
||||
// This ensures we only transform text nodes, not HTML attributes
|
||||
return wrapFileReferencesInHtml(html);
|
||||
}
|
||||
|
||||
export function markdownToTelegramChunks(
|
||||
@@ -88,7 +202,7 @@ export function markdownToTelegramChunks(
|
||||
});
|
||||
const chunks = chunkMarkdownIR(ir, limit);
|
||||
return chunks.map((chunk) => ({
|
||||
html: renderTelegramHtml(chunk),
|
||||
html: wrapFileReferencesInHtml(renderTelegramHtml(chunk)),
|
||||
text: chunk.text,
|
||||
}));
|
||||
}
|
||||
|
||||
101
src/telegram/format.wrap-md.test.ts
Normal file
101
src/telegram/format.wrap-md.test.ts
Normal file
@@ -0,0 +1,101 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
markdownToTelegramHtml,
|
||||
renderTelegramHtmlText,
|
||||
wrapFileReferencesInHtml,
|
||||
} from "./format.js";
|
||||
|
||||
// Unit tests for the low-level HTML post-processor. Each case feeds a small
// HTML/plain-text snippet straight into wrapFileReferencesInHtml.
describe("wrapFileReferencesInHtml", () => {
  it("wraps .md filenames in code tags", () => {
    expect(wrapFileReferencesInHtml("Check README.md")).toContain("Check <code>README.md</code>");
    expect(wrapFileReferencesInHtml("See HEARTBEAT.md for status")).toContain(
      "See <code>HEARTBEAT.md</code> for status",
    );
  });

  it("wraps .go filenames", () => {
    expect(wrapFileReferencesInHtml("Check main.go")).toContain("Check <code>main.go</code>");
  });

  it("wraps .py filenames", () => {
    expect(wrapFileReferencesInHtml("Run script.py")).toContain("Run <code>script.py</code>");
  });

  it("wraps .pl filenames", () => {
    expect(wrapFileReferencesInHtml("Check backup.pl")).toContain("Check <code>backup.pl</code>");
  });

  it("wraps file paths", () => {
    expect(wrapFileReferencesInHtml("Look at squad/friday/HEARTBEAT.md")).toContain(
      "Look at <code>squad/friday/HEARTBEAT.md</code>",
    );
  });

  it("does not wrap inside existing code tags", () => {
    const input = "Already <code>wrapped.md</code> here";
    const result = wrapFileReferencesInHtml(input);
    expect(result).toBe(input);
    expect(result).not.toContain("<code><code>");
  });

  it("does not wrap inside pre tags", () => {
    const input = "<pre><code>README.md</code></pre>";
    const result = wrapFileReferencesInHtml(input);
    expect(result).toBe(input);
  });

  it("does not wrap inside anchor tags", () => {
    const input = '<a href="README.md">Link</a>';
    const result = wrapFileReferencesInHtml(input);
    expect(result).toBe(input);
  });

  it("does not wrap in URLs", () => {
    // The helper only inserts <code> tags and never creates anchors, so a
    // bare URL must pass through byte-for-byte.
    const bareUrl = "Visit https://example.com/README.md";
    expect(wrapFileReferencesInHtml(bareUrl)).toBe(bareUrl);

    // A URL that was already linkified upstream is protected by its <a> tag.
    const linkedUrl =
      'Visit <a href="https://example.com/README.md">https://example.com/README.md</a>';
    const result = wrapFileReferencesInHtml(linkedUrl);
    expect(result).toBe(linkedUrl);
    expect(result).not.toContain("<code>README.md</code>");
  });

  it("handles mixed content correctly", () => {
    const result = wrapFileReferencesInHtml("Check README.md and CONTRIBUTING.md");
    expect(result).toContain("<code>README.md</code>");
    expect(result).toContain("<code>CONTRIBUTING.md</code>");
  });

  it("handles edge cases", () => {
    expect(wrapFileReferencesInHtml("No markdown files here")).not.toContain("<code>");
    expect(wrapFileReferencesInHtml("File.md at start")).toContain("<code>File.md</code>");
    expect(wrapFileReferencesInHtml("Ends with file.md")).toContain("<code>file.md</code>");
  });
});
|
||||
|
||||
// Checks that the public text renderer applies file-reference wrapping in
// both of its text modes.
describe("renderTelegramHtmlText - file reference wrapping", () => {
  it("wraps file references in markdown mode", () => {
    const rendered = renderTelegramHtmlText("Check README.md");

    expect(rendered).toContain("<code>README.md</code>");
  });

  it("wraps file references in HTML mode", () => {
    const rendered = renderTelegramHtmlText("Check README.md", { textMode: "html" });

    expect(rendered).toContain("<code>README.md</code>");
  });

  it("does not double-wrap already code-formatted content", () => {
    const rendered = renderTelegramHtmlText("Already `wrapped.md` here");

    // The inline code span must be present, but never nested in another one.
    expect(rendered).toContain("<code>");
    expect(rendered).not.toContain("<code><code>");
  });
});
|
||||
|
||||
// Checks the wrapFileRefs option of the markdown→HTML converter: wrapping is
// on unless the caller passes wrapFileRefs: false.
describe("markdownToTelegramHtml - file reference wrapping", () => {
  it("wraps file references by default", () => {
    const rendered = markdownToTelegramHtml("Check README.md");

    expect(rendered).toContain("<code>README.md</code>");
  });

  it("can skip wrapping when requested", () => {
    const rendered = markdownToTelegramHtml("Check README.md", { wrapFileRefs: false });

    expect(rendered).not.toContain("<code>README.md</code>");
  });
});
|
||||
Reference in New Issue
Block a user