/**
 * URL detection + display helpers for the chat editor's "URL chip" node.
 * Pure, side-effect-free, and unit-testable — no TipTap / ProseMirror /
 * React imports live here so the same functions back the editor node and
 * any standalone use alike.
 *
 * Design goal mirrors `../filePath/detect.ts`: **LOW false-positive**. We
 * only treat a token as a URL when it carries an explicit web scheme
 * (`http`/`https`/`ftp`/`mailto`/`file`) or a bare `www.` host — exactly
 * the set the editor's `shouldAutoLinkUrl` whitelist already accepts, so
 * the chip never disagrees with the autolinker. Bare file paths and dotted
 * filenames (`manifest.md`) are intentionally NOT matched here — those are
 * the file-path node's job.
 */

// ── Match result ──

export interface UrlMatch {
  /** Inclusive start offset into the scanned string. */
  start: number;
  /** Exclusive end offset into the scanned string. */
  end: number;
  /**
   * The matched URL text, normalised to a navigable href (a bare `www.`
   * host gets an `https://` prefix so the chip's anchor works). Trailing
   * prose punctuation is trimmed.
   */
  href: string;
  /** The matched text as it appeared, minus trimmed trailing punctuation (no scheme added). */
  raw: string;
}

/**
 * Trailing sentence punctuation that, when it hangs off the END of a
 * detected URL token, is almost certainly prose and not part of the URL —
 * trim it. (A URL effectively never ends in these for display purposes;
 * a real trailing `)` that balances an opening `(` is the classic edge
 * case but rare in a chat composer and not worth the complexity.)
 */
const TRAILING_PUNCT = new Set([
  '.', ',', ')', ']', '}', ';', ':', '!', '?', '«', '»', '"', "'", '”', '’', '…',
]);

/**
 * URL shapes, as a single alternation source. Each alternative is
 * whitespace-bounded by construction (`\S` runs). Order: explicit schemes
 * first, then the bare `www.` host.
 *
 * - `mailto:user@host` — mail scheme (no `//`).
 * - `scheme://host/...` — http/https/ftp/file with authority.
 * - `www.host...` — schemeless web host (we prepend https).
 */
const URL_SHAPES = [
  String.raw`mailto:[^\s]+`,
  String.raw`(?:https?|ftp|file):\/\/[^\s]+`,
  String.raw`www\.[^\s/$.?#][^\s]*`,
].join('|');

/** Global scanner used by `findUrls` (stateful via `lastIndex`; reset before each scan). */
const URL_RE = new RegExp(URL_SHAPES, 'giu');

/**
 * Anchored form of the same shapes. Used to re-validate a token AFTER
 * trailing punctuation has been trimmed, so that e.g. `http://...` (which
 * trims down to a host-less `http://`) is not reported as a URL.
 */
const WHOLE_URL_RE = new RegExp(`^(?:${URL_SHAPES})$`, 'iu');

/**
 * Characters allowed immediately before a URL token: whitespace or an
 * opening delimiter / quote.
 */
const LEADING_BOUNDARY_RE = /[\s([{<«"'‘“]/;

/**
 * Strip trailing prose punctuation (see `TRAILING_PUNCT`) from `raw`.
 * Always leaves at least one character.
 */
function trimTrailingPunct(raw: string): string {
  let end = raw.length;
  while (end > 1 && TRAILING_PUNCT.has(raw.charAt(end - 1))) {
    end -= 1;
  }
  return raw.slice(0, end);
}

/** Normalise a raw token to a navigable href (prepend https for `www.`). */
export function normalizeHref(raw: string): string {
  return /^www\./i.test(raw) ? `https://${raw}` : raw;
}

/**
 * Find every URL in `text`. Returns matches in document order with
 * `{ start, end, href, raw }` where `[start, end)` are offsets into the
 * ORIGINAL string (after trailing-punctuation trimming). File paths and
 * bare relative tokens are intentionally NOT matched.
 *
 * A candidate is dropped when:
 * - it is glued onto a preceding non-space, non-opening-delimiter char, or
 * - what remains after trimming trailing punctuation is no longer a
 *   complete URL shape (e.g. `http://...` → `http://`, `mailto:?!` →
 *   `mailto`).
 *
 * @param text The string to scan; empty input yields an empty array.
 * @returns The accepted matches, in the order they occur in `text`.
 */
export function findUrls(text: string): UrlMatch[] {
  if (!text) return [];

  const out: UrlMatch[] = [];
  URL_RE.lastIndex = 0;

  let m: RegExpExecArray | null;
  while ((m = URL_RE.exec(text)) !== null) {
    const raw = m[0];
    const start = m.index;

    // Defensive: every alternative consumes at least one char today, but
    // never let a zero-width match spin the loop if the shapes change.
    if (raw.length === 0) {
      URL_RE.lastIndex += 1;
      continue;
    }

    // Leading-boundary guard: a real URL token starts the string or
    // follows whitespace / an opening delimiter. Glued onto a preceding
    // non-space char it's a substring of a larger token — reject.
    if (start > 0 && !LEADING_BOUNDARY_RE.test(text.charAt(start - 1))) continue;

    const trimmed = trimTrailingPunct(raw);

    // The raw run matched, but trimming may have eaten the only characters
    // after the scheme / `www.` prefix. Require the trimmed token to still
    // be a full URL shape (this also subsumes any minimum-length check).
    if (!WHOLE_URL_RE.test(trimmed)) continue;

    out.push({
      start,
      end: start + trimmed.length,
      href: normalizeHref(trimmed),
      raw: trimmed,
    });
  }
  return out;
}

// ── Split / label (Chrome-style domain + middle-ellipsis path) ──

export interface SplitUrl {
  /** Host without a leading `www.` (e.g. `github.com`). Empty if unparseable. */
  domain: string;
  /**
   * Host for favicon lookup. On the normal parse path this is the parsed
   * hostname with any leading `www.` KEPT; the `mailto:` and regex-fallback
   * paths reuse the same value as `domain`.
   */
  host: string;
  /** Path + query + hash, leading `/` included (empty for a bare host). */
  rest: string;
  /** `mailto:` address (no scheme) when the URL is a mail link, else ''. */
  mail: string;
}

/**
 * Split an href into `{ domain, host, rest, mail }`. Uses the platform
 * `URL` parser; falls back to a light regex when `URL` throws (it
 * shouldn't for our matched shapes, but stay robust).
 *
 * For `mailto:` links the domain is taken from the address part only —
 * anything from the first `?` or `#` onward (header fields such as
 * `?subject=…`) is ignored when deriving `domain` / `host`. `mail` itself
 * is returned verbatim (everything after `mailto:`).
 *
 * @param href A URL or bare `www.` host.
 * @returns The split parts; fields that do not apply are empty strings.
 */
export function splitUrl(href: string): SplitUrl {
  if (/^mailto:/i.test(href)) {
    const mail = href.slice('mailto:'.length);
    const cut = mail.search(/[?#]/);
    const address = cut >= 0 ? mail.slice(0, cut) : mail;
    const at = address.lastIndexOf('@');
    const domain = at >= 0 ? address.slice(at + 1) : '';
    return { domain, host: domain, rest: '', mail };
  }

  const navigable = normalizeHref(href);
  try {
    const u = new URL(navigable);
    const domain = u.hostname.replace(/^www\./i, '');
    const path = u.pathname === '/' ? '' : u.pathname;
    return { domain, host: u.hostname, rest: `${path}${u.search}${u.hash}`, mail: '' };
  } catch {
    // Best-effort fallback for an unparseable token.
    const noScheme = navigable.replace(/^[a-z]+:\/\//i, '');
    const slash = noScheme.indexOf('/');
    const authority = slash >= 0 ? noScheme.slice(0, slash) : noScheme;
    const host = authority.replace(/^www\./i, '');
    const rest = slash >= 0 ? noScheme.slice(slash) : '';
    return { domain: host, host, rest, mail: '' };
  }
}

const ELLIPSIS = '…';

export interface UrlLabelOptions {
  /**
   * Approximate max characters for the WHOLE label (domain + path). The
   * domain is always kept in full; only the path's middle collapses.
   * Default 40.
   */
  maxChars?: number;
}

/**
 * Produce a compact, Chrome-omnibox-style label for a URL: the domain is
 * sacred (always shown in full), then the path's MIDDLE collapses to `…`
 * while the LAST path segment (often the meaningful slug / filename) is
 * kept. Candidates are tried from most to least informative and the first
 * one that fits `maxChars` wins:
 *
 *   1. `domain/first/…/last` (only when there are ≥ 2 segments)
 *   2. `domain/…/last`
 *   3. `domain/…`
 *
 * If none fits, `domain/…/last` is returned anyway (it may exceed
 * `maxChars`; CSS ellipsis is the safety net in the chip).
 *
 * Examples:
 *   https://github.com/wailsapp/wails/blob/main/v3/README.md
 *     → github.com/wailsapp/…/README.md     (maxChars 40)
 *     → github.com/…/README.md              (maxChars 25)
 *   https://github.com/wailsapp/wails
 *     → github.com/wailsapp/wails           (short — unchanged)
 *   mailto:me@example.com → me@example.com
 *
 * @param href The URL to label.
 * @param opts Optional `maxChars` budget (default 40).
 * @returns The display label.
 */
export function truncateUrlLabel(href: string, opts: UrlLabelOptions = {}): string {
  const maxChars = opts.maxChars ?? 40;
  const { domain, rest, mail } = splitUrl(href);

  if (mail) {
    // Mail links keep their tail (the domain end) and elide the front.
    return mail.length <= maxChars
      ? mail
      : `${ELLIPSIS}${mail.slice(mail.length - (maxChars - 1))}`;
  }

  const full = `${domain}${rest}`;
  if (full.length <= maxChars) return full;

  // Collapse the path's middle, keeping the last segment.
  const segments = rest.split('/').filter((s) => s.length > 0);
  if (segments.length === 0) {
    // Only a domain, but it's longer than maxChars — return as-is (CSS
    // ellipsis is the safety net in the chip).
    return domain;
  }

  const last = segments[segments.length - 1]!;
  const domainEllipsisLast = `${domain}/${ELLIPSIS}/${last}`;

  const candidates: string[] = [];
  if (segments.length >= 2) {
    candidates.push(`${domain}/${segments[0]!}/${ELLIPSIS}/${last}`);
  }
  candidates.push(domainEllipsisLast, `${domain}/${ELLIPSIS}`);

  return candidates.find((c) => c.length <= maxChars) ?? domainEllipsisLast;
}

/**
 * Favicon URL for a host via Google's S2 service. Degrades to the globe
 * icon on error (the chip wires an `onerror` handler). Returns '' when the
 * host is empty.
 *
 * @param host Hostname to look up; URI-component-encoded into the query.
 * @param size Requested icon size, passed as the `sz` parameter (default 32).
 */
export function faviconUrl(host: string, size = 32): string {
  if (!host) return '';
  return `https://www.google.com/s2/favicons?domain=${encodeURIComponent(host)}&sz=${size}`;
}