mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-24 11:44:28 +00:00
Telegram's auto-linker aggressively treats filenames like HEARTBEAT.md, README.md, main.go, script.py as URLs and generates domain registrar previews. This fix adds comprehensive protection for file extensions that share TLDs: - High priority: .md, .go, .py, .pl, .ai, .sh - Medium priority: .io, .tv, .fm, .am, .at, .be, .cc, .co Implementation: - Added wrapFileReferencesInHtml() in format.ts - Runs AFTER markdown→HTML conversion - Tokenizes HTML to respect tag boundaries - Skips content inside <code>, <pre>, <a> tags (no nesting issues) - Applied to all rendering paths: renderTelegramHtmlText, markdownToTelegramHtml, markdownToTelegramChunks, and delivery.ts fallback Addresses review comments: - P1: Now handles chunked rendering paths correctly - P2: No longer wraps inside existing code blocks (token-based parsing) - No lookbehinds used (broad Node compatibility) Includes comprehensive test suite in format.wrap-md.test.ts AI-assisted: true
213 lines
6.1 KiB
TypeScript
213 lines
6.1 KiB
TypeScript
import type { MarkdownTableMode } from "../config/types.base.js";
|
|
import {
|
|
chunkMarkdownIR,
|
|
markdownToIR,
|
|
type MarkdownLinkSpan,
|
|
type MarkdownIR,
|
|
} from "../markdown/ir.js";
|
|
import { renderMarkdownWithMarkers } from "../markdown/render.js";
|
|
|
|
export type TelegramFormattedChunk = {
|
|
html: string;
|
|
text: string;
|
|
};
|
|
|
|
function escapeHtml(text: string): string {
|
|
return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">");
|
|
}
|
|
|
|
function escapeHtmlAttr(text: string): string {
|
|
return escapeHtml(text).replace(/"/g, """);
|
|
}
|
|
|
|
function buildTelegramLink(link: MarkdownLinkSpan, _text: string) {
|
|
const href = link.href.trim();
|
|
if (!href) {
|
|
return null;
|
|
}
|
|
if (link.start === link.end) {
|
|
return null;
|
|
}
|
|
const safeHref = escapeHtmlAttr(href);
|
|
return {
|
|
start: link.start,
|
|
end: link.end,
|
|
open: `<a href="${safeHref}">`,
|
|
close: "</a>",
|
|
};
|
|
}
|
|
|
|
function renderTelegramHtml(ir: MarkdownIR): string {
|
|
return renderMarkdownWithMarkers(ir, {
|
|
styleMarkers: {
|
|
bold: { open: "<b>", close: "</b>" },
|
|
italic: { open: "<i>", close: "</i>" },
|
|
strikethrough: { open: "<s>", close: "</s>" },
|
|
code: { open: "<code>", close: "</code>" },
|
|
code_block: { open: "<pre><code>", close: "</code></pre>" },
|
|
},
|
|
escapeText: escapeHtml,
|
|
buildLink: buildTelegramLink,
|
|
});
|
|
}
|
|
|
|
export function markdownToTelegramHtml(
|
|
markdown: string,
|
|
options: { tableMode?: MarkdownTableMode; wrapFileRefs?: boolean } = {},
|
|
): string {
|
|
const ir = markdownToIR(markdown ?? "", {
|
|
linkify: true,
|
|
headingStyle: "none",
|
|
blockquotePrefix: "",
|
|
tableMode: options.tableMode,
|
|
});
|
|
const html = renderTelegramHtml(ir);
|
|
// Apply file reference wrapping if requested (for chunked rendering)
|
|
if (options.wrapFileRefs !== false) {
|
|
return wrapFileReferencesInHtml(html);
|
|
}
|
|
return html;
|
|
}
|
|
|
|
/**
|
|
* File extensions that share TLDs and commonly appear in code/documentation.
|
|
* These are wrapped in <code> tags to prevent Telegram from generating
|
|
* spurious domain registrar previews.
|
|
*/
|
|
const FILE_EXTENSIONS_WITH_TLD = new Set([
|
|
// High priority - commonly referenced in messages
|
|
"md", // Markdown (Moldova)
|
|
"go", // Go language
|
|
"py", // Python (Paraguay)
|
|
"pl", // Perl (Poland)
|
|
"ai", // Adobe Illustrator (Anguilla)
|
|
"sh", // Shell (Saint Helena)
|
|
// Medium priority - sometimes referenced
|
|
"io", // Tuvalu (often used for tech projects)
|
|
"tv", // Tuvalu (video files)
|
|
"fm", // Federated States of Micronesia (audio)
|
|
"am", // Armenia
|
|
"at", // Austria
|
|
"be", // Belgium
|
|
"cc", // Cocos Islands
|
|
"co", // Colombia
|
|
]);
|
|
|
|
/**
|
|
* Wraps standalone file references (with TLD extensions) in <code> tags.
|
|
* This prevents Telegram from treating them as URLs and generating
|
|
* irrelevant domain registrar previews.
|
|
*
|
|
* Runs AFTER markdown→HTML conversion to avoid modifying HTML attributes.
|
|
* Skips content inside <code>, <pre>, and <a> tags to avoid nesting issues.
|
|
*/
|
|
export function wrapFileReferencesInHtml(html: string): string {
|
|
// Build regex pattern for all tracked extensions
|
|
const extensionsPattern = Array.from(FILE_EXTENSIONS_WITH_TLD).join("|");
|
|
const filePattern = new RegExp(
|
|
`(^|>|[\\s])([a-zA-Z0-9_.\\-./]+\\.(?:${extensionsPattern}))(?=$|[\\s<])`,
|
|
"gi",
|
|
);
|
|
|
|
// Track if we're inside tags that should not be modified
|
|
let inCode = false;
|
|
let inPre = false;
|
|
let inAnchor = false;
|
|
let result = "";
|
|
let lastIndex = 0;
|
|
|
|
// Process the HTML token by token to respect tag boundaries
|
|
const tagPattern = /(<\/?)(code|pre|a)\b[^>]*?>/gi;
|
|
let match: RegExpExecArray | null;
|
|
|
|
while ((match = tagPattern.exec(html)) !== null) {
|
|
const tagStart = match.index;
|
|
const tagEnd = tagPattern.lastIndex;
|
|
const isClosing = match[1] === "/";
|
|
const tagName = match[2].toLowerCase();
|
|
|
|
// Process text before this tag
|
|
const textBefore = html.slice(lastIndex, tagStart);
|
|
result += textBefore.replace(filePattern, (m, prefix, filename) => {
|
|
// Skip if inside protected tags or if it's a URL
|
|
if (inCode || inPre || inAnchor) {
|
|
return m;
|
|
}
|
|
if (filename.startsWith("//")) {
|
|
return m;
|
|
}
|
|
if (/https?:\/\/$/i.test(prefix)) {
|
|
return m;
|
|
}
|
|
return `${prefix}<code>${filename}</code>`;
|
|
});
|
|
|
|
// Update tag state
|
|
if (tagName === "code") {
|
|
inCode = !isClosing;
|
|
} else if (tagName === "pre") {
|
|
inPre = !isClosing;
|
|
} else if (tagName === "a") {
|
|
inAnchor = !isClosing;
|
|
}
|
|
|
|
// Add the tag itself
|
|
result += html.slice(tagStart, tagEnd);
|
|
lastIndex = tagEnd;
|
|
}
|
|
|
|
// Process remaining text
|
|
const remainingText = html.slice(lastIndex);
|
|
result += remainingText.replace(filePattern, (m, prefix, filename) => {
|
|
if (inCode || inPre || inAnchor) {
|
|
return m;
|
|
}
|
|
if (filename.startsWith("//")) {
|
|
return m;
|
|
}
|
|
if (/https?:\/\/$/i.test(prefix)) {
|
|
return m;
|
|
}
|
|
return `${prefix}<code>${filename}</code>`;
|
|
});
|
|
|
|
return result;
|
|
}
|
|
|
|
export function renderTelegramHtmlText(
|
|
text: string,
|
|
options: { textMode?: "markdown" | "html"; tableMode?: MarkdownTableMode } = {},
|
|
): string {
|
|
const textMode = options.textMode ?? "markdown";
|
|
if (textMode === "html") {
|
|
// For HTML mode, still wrap file references in the HTML
|
|
return wrapFileReferencesInHtml(text);
|
|
}
|
|
const html = markdownToTelegramHtml(text, { tableMode: options.tableMode });
|
|
// Wrap file references after markdown→HTML conversion
|
|
// This ensures we only transform text nodes, not HTML attributes
|
|
return wrapFileReferencesInHtml(html);
|
|
}
|
|
|
|
export function markdownToTelegramChunks(
|
|
markdown: string,
|
|
limit: number,
|
|
options: { tableMode?: MarkdownTableMode } = {},
|
|
): TelegramFormattedChunk[] {
|
|
const ir = markdownToIR(markdown ?? "", {
|
|
linkify: true,
|
|
headingStyle: "none",
|
|
blockquotePrefix: "",
|
|
tableMode: options.tableMode,
|
|
});
|
|
const chunks = chunkMarkdownIR(ir, limit);
|
|
return chunks.map((chunk) => ({
|
|
html: wrapFileReferencesInHtml(renderTelegramHtml(chunk)),
|
|
text: chunk.text,
|
|
}));
|
|
}
|
|
|
|
export function markdownToTelegramHtmlChunks(markdown: string, limit: number): string[] {
|
|
return markdownToTelegramChunks(markdown, limit).map((chunk) => chunk.html);
|
|
}
|