From 70f73e6f8dfaac9530fd6d5cdddc8b3db02ced96 Mon Sep 17 00:00:00 2001
From: divanoli
Date: Wed, 4 Feb 2026 10:09:37 +0300
Subject: [PATCH] fix(telegram): auto-wrap file references with TLD extensions to prevent URL previews
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Telegram's auto-linker aggressively treats filenames like HEARTBEAT.md,
README.md, main.go, script.py as URLs and generates domain registrar
previews.

This fix adds comprehensive protection for file extensions that share TLDs:
- High priority: .md, .go, .py, .pl, .ai, .sh
- Medium priority: .io, .tv, .fm, .am, .at, .be, .cc, .co

Implementation:
- Added wrapFileReferencesInHtml() in format.ts
- Runs AFTER markdown→HTML conversion
- Tokenizes HTML to respect tag boundaries
- Skips content inside <code>, <pre>, <a> tags (no nesting issues)
- Applied to all rendering paths: renderTelegramHtmlText, markdownToTelegramHtml,
  markdownToTelegramChunks, and delivery.ts fallback

Addresses review comments:
- P1: Now handles chunked rendering paths correctly
- P2: No longer wraps inside existing code blocks (token-based parsing)
- No lookbehinds used (broad Node compatibility)

Includes comprehensive test suite in format.wrap-md.test.ts

AI-assisted: true
---
 src/telegram/bot/delivery.ts        |   5 +-
 src/telegram/format.ts              | 124 ++++++++++++++++++++++++++--
 src/telegram/format.wrap-md.test.ts | 101 ++++++++++++++++++++++
 3 files changed, 224 insertions(+), 6 deletions(-)
 create mode 100644 src/telegram/format.wrap-md.test.ts

diff --git a/src/telegram/bot/delivery.ts b/src/telegram/bot/delivery.ts
index f5eca9bfa56..36bfabc2a6d 100644
--- a/src/telegram/bot/delivery.ts
+++ b/src/telegram/bot/delivery.ts
@@ -18,6 +18,7 @@ import {
   markdownToTelegramChunks,
   markdownToTelegramHtml,
   renderTelegramHtmlText,
+  wrapFileReferencesInHtml,
 } from "../format.js";
 import { buildInlineKeyboard } from "../send.js";
 import { cacheSticker, getCachedSticker } from "../sticker-cache.js";
@@ -76,7 +77,9 @@ export async function deliverReplies(params: {
       const nested = markdownToTelegramChunks(chunk, textLimit, { tableMode: params.tableMode });
       if (!nested.length && chunk) {
         chunks.push({
-          html: markdownToTelegramHtml(chunk, { tableMode: params.tableMode }),
+          html: wrapFileReferencesInHtml(
+            markdownToTelegramHtml(chunk, { tableMode: params.tableMode, wrapFileRefs: false }),
+          ),
           text: chunk,
         });
         continue;
diff --git a/src/telegram/format.ts b/src/telegram/format.ts
index e3d7e4c4301..f82921fa5fb 100644
--- a/src/telegram/format.ts
+++ b/src/telegram/format.ts
@@ -53,7 +53,7 @@ function renderTelegramHtml(ir: MarkdownIR): string {
 
 export function markdownToTelegramHtml(
   markdown: string,
-  options: { tableMode?: MarkdownTableMode } = {},
+  options: { tableMode?: MarkdownTableMode; wrapFileRefs?: boolean } = {},
 ): string {
   const ir = markdownToIR(markdown ?? "", {
     linkify: true,
@@ -61,7 +61,117 @@ export function markdownToTelegramHtml(
     blockquotePrefix: "",
     tableMode: options.tableMode,
   });
-  return renderTelegramHtml(ir);
+  const html = renderTelegramHtml(ir);
+  // Apply file reference wrapping if requested (for chunked rendering)
+  if (options.wrapFileRefs !== false) {
+    return wrapFileReferencesInHtml(html);
+  }
+  return html;
+}
+
+/**
+ * File extensions that share TLDs and commonly appear in code/documentation.
+ * These are wrapped in <code> tags to prevent Telegram from generating
+ * spurious domain registrar previews.
+ */
+const FILE_EXTENSIONS_WITH_TLD = new Set([
+  // High priority - commonly referenced in messages
+  "md", // Markdown (Moldova)
+  "go", // Go language
+  "py", // Python (Paraguay)
+  "pl", // Perl (Poland)
+  "ai", // Adobe Illustrator (Anguilla)
+  "sh", // Shell (Saint Helena)
+  // Medium priority - sometimes referenced
+  "io", // British Indian Ocean Territory (often used for tech projects)
+  "tv", // Tuvalu (video files)
+  "fm", // Federated States of Micronesia (audio)
+  "am", // Armenia
+  "at", // Austria
+  "be", // Belgium
+  "cc", // Cocos Islands
+  "co", // Colombia
+]);
+
+/**
+ * Wraps standalone file references (with TLD extensions) in <code> tags.
+ * This prevents Telegram from treating them as URLs and generating
+ * irrelevant domain registrar previews.
+ *
+ * Runs AFTER markdown→HTML conversion to avoid modifying HTML attributes.
+ * Skips content inside <code>, <pre>, and <a> tags to avoid nesting issues.
+ *
+ * A reference only matches at the start of a text segment, after ">" or after
+ * whitespace, and must be followed by whitespace, "<" or the end of the segment.
+ * Full URLs (https://host/file.md) never match because ":" is not a filename char.
+ *
+ * @param html - HTML whose text outside <code>/<pre>/<a> is scanned.
+ * @returns The HTML with every matched file reference wrapped in <code>.
+ */
+export function wrapFileReferencesInHtml(html: string): string {
+  // Build regex pattern for all tracked extensions
+  const extensionsPattern = Array.from(FILE_EXTENSIONS_WITH_TLD).join("|");
+  const filePattern = new RegExp(
+    `(^|>|[\\s])([a-zA-Z0-9_.\\-./]+\\.(?:${extensionsPattern}))(?=$|[\\s<])`,
+    "gi",
+  );
+
+  // Track if we're inside tags that should not be modified
+  let inCode = false;
+  let inPre = false;
+  let inAnchor = false;
+  let result = "";
+  let lastIndex = 0;
+
+  // Wraps the file references of one text segment (the text between two
+  // tracked tags). It reads the in* flags, so it must be called BEFORE the
+  // flags are updated for the tag that ends the segment.
+  const wrapSegment = (segment: string): string =>
+    segment.replace(filePattern, (m: string, prefix: string, filename: string) => {
+      // Leave text inside <code>/<pre>/<a> untouched
+      if (inCode || inPre || inAnchor) {
+        return m;
+      }
+      // Protocol-relative URLs ("//host/file.md") are links, not files
+      if (filename.startsWith("//")) {
+        return m;
+      }
+      // Re-emit the prefix outside the wrapper so spacing stays unchanged
+      return `${prefix}<code>${filename}</code>`;
+    });
+
+  // Process the HTML token by token to respect tag boundaries
+  const tagPattern = /(<\/?)(code|pre|a)\b[^>]*?>/gi;
+  let match: RegExpExecArray | null;
+
+  while ((match = tagPattern.exec(html)) !== null) {
+    const tagStart = match.index;
+    const tagEnd = tagPattern.lastIndex;
+    // Group 1 is "<" for an opening tag and "</" for a closing tag
+    const isClosing = match[1] === "</";
+    const tagName = match[2].toLowerCase();
+
+    // Process text before this tag
+    result += wrapSegment(html.slice(lastIndex, tagStart));
+
+    // Update tag state
+    if (tagName === "code") {
+      inCode = !isClosing;
+    } else if (tagName === "pre") {
+      inPre = !isClosing;
+    } else if (tagName === "a") {
+      inAnchor = !isClosing;
+    }
+
+    // Add the tag itself
+    result += html.slice(tagStart, tagEnd);
+    lastIndex = tagEnd;
+  }
+
+  // Process remaining text (still subject to the current tag state)
+  result += wrapSegment(html.slice(lastIndex));
+
+  return result;
 }
 
 export function renderTelegramHtmlText(
@@ -70,9 +180,13 @@ export function renderTelegramHtmlText(
 ): string {
   const textMode = options.textMode ?? "markdown";
   if (textMode === "html") {
-    return text;
+    // For HTML mode, still wrap file references in the HTML
+    return wrapFileReferencesInHtml(text);
   }
-  return markdownToTelegramHtml(text, { tableMode: options.tableMode });
+  const html = markdownToTelegramHtml(text, { tableMode: options.tableMode });
+  // Wrap file references after markdown→HTML conversion
+  // This ensures we only transform text nodes, not HTML attributes
+  return wrapFileReferencesInHtml(html);
 }
 
 export function markdownToTelegramChunks(
@@ -88,7 +202,7 @@ export function markdownToTelegramChunks(
   });
   const chunks = chunkMarkdownIR(ir, limit);
   return chunks.map((chunk) => ({
-    html: renderTelegramHtml(chunk),
+    html: wrapFileReferencesInHtml(renderTelegramHtml(chunk)),
     text: chunk.text,
   }));
 }
diff --git a/src/telegram/format.wrap-md.test.ts b/src/telegram/format.wrap-md.test.ts
new file mode 100644
index 00000000000..5ef7d1297c6
--- /dev/null
+++ b/src/telegram/format.wrap-md.test.ts
@@ -0,0 +1,101 @@
+import { describe, expect, it } from "vitest";
+import {
+  markdownToTelegramHtml,
+  renderTelegramHtmlText,
+  wrapFileReferencesInHtml,
+} from "./format.js";
+
+describe("wrapFileReferencesInHtml", () => {
+  it("wraps .md filenames in code tags", () => {
+    expect(wrapFileReferencesInHtml("Check README.md")).toContain("Check <code>README.md</code>");
+    expect(wrapFileReferencesInHtml("See HEARTBEAT.md for status")).toContain(
+      "See <code>HEARTBEAT.md</code> for status",
+    );
+  });
+
+  it("wraps .go filenames", () => {
+    expect(wrapFileReferencesInHtml("Check main.go")).toContain("Check <code>main.go</code>");
+  });
+
+  it("wraps .py filenames", () => {
+    expect(wrapFileReferencesInHtml("Run script.py")).toContain("Run <code>script.py</code>");
+  });
+
+  it("wraps .pl filenames", () => {
+    expect(wrapFileReferencesInHtml("Check backup.pl")).toContain("Check <code>backup.pl</code>");
+  });
+
+  it("wraps file paths", () => {
+    expect(wrapFileReferencesInHtml("Look at squad/friday/HEARTBEAT.md")).toContain(
+      "Look at <code>squad/friday/HEARTBEAT.md</code>",
+    );
+  });
+
+  it("does not wrap inside existing code tags", () => {
+    const input = "Already <code>wrapped.md</code> here";
+    const result = wrapFileReferencesInHtml(input);
+    expect(result).toBe(input);
+    expect(result).not.toContain("<code><code>");
+  });
+
+  it("does not wrap inside pre tags", () => {
+    const input = "<pre>README.md</pre>";
+    const result = wrapFileReferencesInHtml(input);
+    expect(result).toBe(input);
+  });
+
+  it("does not wrap inside anchor tags", () => {
+    const input = '<a href="README.md">Link</a>';
+    const result = wrapFileReferencesInHtml(input);
+    expect(result).toBe(input);
+  });
+
+  it("does not wrap in URLs", () => {
+    const result = wrapFileReferencesInHtml("Visit https://example.com/README.md");
+    expect(result).toBe("Visit https://example.com/README.md");
+    expect(result).not.toContain("<code>");
+  });
+
+  it("handles mixed content correctly", () => {
+    const result = wrapFileReferencesInHtml("Check README.md and CONTRIBUTING.md");
+    expect(result).toContain("<code>README.md</code>");
+    expect(result).toContain("<code>CONTRIBUTING.md</code>");
+  });
+
+  it("handles edge cases", () => {
+    expect(wrapFileReferencesInHtml("No markdown files here")).not.toContain("<code>");
+    expect(wrapFileReferencesInHtml("File.md at start")).toContain("<code>File.md</code>");
+    expect(wrapFileReferencesInHtml("Ends with file.md")).toContain("<code>file.md</code>");
+  });
+});
+
+describe("renderTelegramHtmlText - file reference wrapping", () => {
+  it("wraps file references in markdown mode", () => {
+    const result = renderTelegramHtmlText("Check README.md");
+    expect(result).toContain("<code>README.md</code>");
+  });
+
+  it("wraps file references in HTML mode", () => {
+    const result = renderTelegramHtmlText("Check README.md", { textMode: "html" });
+    expect(result).toContain("<code>README.md</code>");
+  });
+
+  it("does not double-wrap already code-formatted content", () => {
+    const result = renderTelegramHtmlText("Already `wrapped.md` here");
+    // Should have code tags but not nested
+    expect(result).toContain("<code>wrapped.md</code>");
+    expect(result).not.toContain("<code><code>");
+  });
+});
+
+describe("markdownToTelegramHtml - file reference wrapping", () => {
+  it("wraps file references by default", () => {
+    const result = markdownToTelegramHtml("Check README.md");
+    expect(result).toContain("<code>README.md</code>");
+  });
+
+  it("can skip wrapping when requested", () => {
+    const result = markdownToTelegramHtml("Check README.md", { wrapFileRefs: false });
+    expect(result).not.toContain("<code>README.md</code>");
+  });
+});