Files
openclaw/src/telegram/format.ts
divanoli 90974e1030 refactor(telegram): use snapshot for orphaned TLD offset clarity
Use explicit snapshot variable when checking tag positions in orphaned
TLD pass. While JavaScript's replace() doesn't mutate during iteration,
this makes intent explicit and adds test coverage for multi-TLD HTML.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 12:41:08 +03:00

279 lines
8.7 KiB
TypeScript

import type { MarkdownTableMode } from "../config/types.base.js";
import {
chunkMarkdownIR,
markdownToIR,
type MarkdownLinkSpan,
type MarkdownIR,
} from "../markdown/ir.js";
import { renderMarkdownWithMarkers } from "../markdown/render.js";
export type TelegramFormattedChunk = {
html: string;
text: string;
};
function escapeHtml(text: string): string {
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
}
function escapeHtmlAttr(text: string): string {
return escapeHtml(text).replace(/"/g, "&quot;");
}
/**
* File extensions that share TLDs and commonly appear in code/documentation.
* These are wrapped in <code> tags to prevent Telegram from generating
* spurious domain registrar previews.
*
* Only includes extensions that are:
* 1. Commonly used as file extensions in code/docs
* 2. Rarely used as intentional domain references
*
* Excluded: .ai, .io, .tv, .fm (popular domain TLDs like x.ai, vercel.io, github.io)
*/
const FILE_EXTENSIONS_WITH_TLD = new Set([
"md", // Markdown (Moldova) - very common in repos
"go", // Go language - common in Go projects
"py", // Python (Paraguay) - common in Python projects
"pl", // Perl (Poland) - common in Perl projects
"sh", // Shell (Saint Helena) - common for scripts
"am", // Automake files (Armenia)
"at", // Assembly (Austria)
"be", // Backend files (Belgium)
"cc", // C++ source (Cocos Islands)
"co", // Configuration (Colombia)
]);
/** Detects when markdown-it linkify auto-generated a link from a bare filename (e.g. README.md → http://README.md) */
function isAutoLinkedFileRef(href: string, label: string): boolean {
const stripped = href.replace(/^https?:\/\//i, "");
if (stripped !== label) {
return false;
}
const dotIndex = label.lastIndexOf(".");
if (dotIndex < 1) {
return false;
}
const ext = label.slice(dotIndex + 1).toLowerCase();
if (!FILE_EXTENSIONS_WITH_TLD.has(ext)) {
return false;
}
// Reject if any path segment before the filename contains a dot (looks like a domain)
const segments = label.split("/");
if (segments.length > 1) {
for (let i = 0; i < segments.length - 1; i++) {
if (segments[i].includes(".")) {
return false;
}
}
}
return true;
}
function buildTelegramLink(link: MarkdownLinkSpan, text: string) {
const href = link.href.trim();
if (!href) {
return null;
}
if (link.start === link.end) {
return null;
}
// Suppress auto-linkified file references (e.g. README.md → http://README.md)
const label = text.slice(link.start, link.end);
if (isAutoLinkedFileRef(href, label)) {
return null;
}
const safeHref = escapeHtmlAttr(href);
return {
start: link.start,
end: link.end,
open: `<a href="${safeHref}">`,
close: "</a>",
};
}
function renderTelegramHtml(ir: MarkdownIR): string {
return renderMarkdownWithMarkers(ir, {
styleMarkers: {
bold: { open: "<b>", close: "</b>" },
italic: { open: "<i>", close: "</i>" },
strikethrough: { open: "<s>", close: "</s>" },
code: { open: "<code>", close: "</code>" },
code_block: { open: "<pre><code>", close: "</code></pre>" },
},
escapeText: escapeHtml,
buildLink: buildTelegramLink,
});
}
export function markdownToTelegramHtml(
markdown: string,
options: { tableMode?: MarkdownTableMode; wrapFileRefs?: boolean } = {},
): string {
const ir = markdownToIR(markdown ?? "", {
linkify: true,
headingStyle: "none",
blockquotePrefix: "",
tableMode: options.tableMode,
});
const html = renderTelegramHtml(ir);
// Apply file reference wrapping if requested (for chunked rendering)
if (options.wrapFileRefs !== false) {
return wrapFileReferencesInHtml(html);
}
return html;
}
/**
* Wraps standalone file references (with TLD extensions) in <code> tags.
* This prevents Telegram from treating them as URLs and generating
* irrelevant domain registrar previews.
*
* Runs AFTER markdown→HTML conversion to avoid modifying HTML attributes.
* Skips content inside <code>, <pre>, and <a> tags to avoid nesting issues.
*/
/** Escape regex metacharacters in a string */
function escapeRegex(str: string): string {
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
export function wrapFileReferencesInHtml(html: string): string {
// Build regex pattern for all tracked extensions (escape metacharacters for safety)
const extensionsPattern = Array.from(FILE_EXTENSIONS_WITH_TLD).map(escapeRegex).join("|");
// Safety-net: de-linkify auto-generated anchors where href="http://<label>" (defense in depth for textMode: "html")
const autoLinkedAnchor = /<a\s+href="https?:\/\/([^"]+)"[^>]*>\1<\/a>/gi;
html = html.replace(autoLinkedAnchor, (_match, label: string) => {
if (!isAutoLinkedFileRef(`http://${label}`, label)) {
return _match;
}
return `<code>${escapeHtml(label)}</code>`;
});
const filePattern = new RegExp(
`(^|>|[\\s])([a-zA-Z0-9_.\\-./]+\\.(?:${extensionsPattern}))(?=$|[\\s<])`,
"gi",
);
// Track nesting depth for tags that should not be modified
let codeDepth = 0;
let preDepth = 0;
let anchorDepth = 0;
let result = "";
let lastIndex = 0;
// Process the HTML token by token to respect tag boundaries
const tagPattern = /(<\/?)(code|pre|a)\b[^>]*?>/gi;
let match: RegExpExecArray | null;
while ((match = tagPattern.exec(html)) !== null) {
const tagStart = match.index;
const tagEnd = tagPattern.lastIndex;
const isClosing = match[1] === "</";
const tagName = match[2].toLowerCase();
// Process text before this tag
const textBefore = html.slice(lastIndex, tagStart);
result += textBefore.replace(filePattern, (m, prefix, filename) => {
// Skip if inside protected tags or if it's a URL
if (codeDepth > 0 || preDepth > 0 || anchorDepth > 0) {
return m;
}
if (filename.startsWith("//")) {
return m;
}
if (/https?:\/\/$/i.test(prefix)) {
return m;
}
return `${prefix}<code>${escapeHtml(filename)}</code>`;
});
// Update tag depth
if (tagName === "code") {
codeDepth += isClosing ? -1 : 1;
} else if (tagName === "pre") {
preDepth += isClosing ? -1 : 1;
} else if (tagName === "a") {
anchorDepth += isClosing ? -1 : 1;
}
// Add the tag itself
result += html.slice(tagStart, tagEnd);
lastIndex = tagEnd;
}
// Process remaining text
const remainingText = html.slice(lastIndex);
result += remainingText.replace(filePattern, (m, prefix, filename) => {
if (codeDepth > 0 || preDepth > 0 || anchorDepth > 0) {
return m;
}
if (filename.startsWith("//")) {
return m;
}
if (/https?:\/\/$/i.test(prefix)) {
return m;
}
return `${prefix}<code>${escapeHtml(filename)}</code>`;
});
// Second pass: catch orphaned single-letter TLD patterns (e.g., 'D.md' in 'R&D.md')
// These can be auto-linked by Telegram as domains
const orphanedTldPattern = new RegExp(
`([^a-zA-Z0-9]|^)([A-Za-z]\\.(?:${extensionsPattern}))(?=[^a-zA-Z0-9/]|$)`,
"g",
);
// Snapshot for offset calculations (offset is relative to pre-replacement string)
// Note: replace() doesn't mutate, but snapshot makes intent explicit
const snapshot = result;
result = snapshot.replace(orphanedTldPattern, (m, prefix, tld, offset) => {
// Skip if prefix is > (right after a tag close)
if (prefix === ">") {
return m;
}
// Skip if we're inside an HTML tag (between < and >)
const lastOpen = snapshot.lastIndexOf("<", offset);
const lastClose = snapshot.lastIndexOf(">", offset);
if (lastOpen > lastClose) {
return m; // Inside a tag
}
return `${prefix}<code>${escapeHtml(tld)}</code>`;
});
return result;
}
export function renderTelegramHtmlText(
text: string,
options: { textMode?: "markdown" | "html"; tableMode?: MarkdownTableMode } = {},
): string {
const textMode = options.textMode ?? "markdown";
if (textMode === "html") {
// For HTML mode, trust caller markup - don't modify
return text;
}
// markdownToTelegramHtml already wraps file references by default
return markdownToTelegramHtml(text, { tableMode: options.tableMode });
}
export function markdownToTelegramChunks(
markdown: string,
limit: number,
options: { tableMode?: MarkdownTableMode } = {},
): TelegramFormattedChunk[] {
const ir = markdownToIR(markdown ?? "", {
linkify: true,
headingStyle: "none",
blockquotePrefix: "",
tableMode: options.tableMode,
});
const chunks = chunkMarkdownIR(ir, limit);
return chunks.map((chunk) => ({
html: wrapFileReferencesInHtml(renderTelegramHtml(chunk)),
text: chunk.text,
}));
}
export function markdownToTelegramHtmlChunks(markdown: string, limit: number): string[] {
return markdownToTelegramChunks(markdown, limit).map((chunk) => chunk.html);
}