/**
 * Curated list of code block languages.
 *
 * Used as suggestions in the editor's language picker. The picker accepts
 * free-form text, so this is a starting point, not a restriction. The `id`
 * is the canonical identifier persisted in the Portable Text `language`
 * field and emitted as a `language-{id}` CSS class on the frontend.
 *
 * Aliases let common variants ("typescript", "ts") resolve to the same id.
 * Frontend highlighters (shipped in a follow-up PR) will use this map to
 * normalize unknown inputs.
 */

import type { MessageDescriptor } from "@lingui/core";
import { msg } from "@lingui/core/macro";

export interface CodeBlockLanguage {
	/** Canonical identifier persisted in storage and emitted as `language-{id}`. */
	id: string;
	/** Human-readable label shown in the picker. */
	label: MessageDescriptor;
	/** Alternative identifiers (typed by the user) that resolve to this language. */
	aliases?: string[];
}

export const CODE_BLOCK_LANGUAGES: readonly CodeBlockLanguage[] = [
	{ id: "plaintext", label: msg`Plain text`, aliases: ["text", "plain", "txt"] },
	{ id: "astro", label: msg`Astro` },
	{ id: "bash", label: msg`Bash`, aliases: ["sh", "shell", "zsh"] },
	{ id: "c", label: msg`C` },
	{ id: "cpp", label: msg`C++`, aliases: ["c++"] },
	{ id: "csharp", label: msg`C#`, aliases: ["cs", "c#"] },
	{ id: "css", label: msg`CSS` },
	{ id: "diff", label: msg`Diff`, aliases: ["patch"] },
	{ id: "dockerfile", label: msg`Dockerfile`, aliases: ["docker"] },
	{ id: "go", label: msg`Go`, aliases: ["golang"] },
	{ id: "graphql", label: msg`GraphQL`, aliases: ["gql"] },
	{ id: "html", label: msg`HTML` },
	{ id: "java", label: msg`Java` },
	{ id: "javascript", label: msg`JavaScript`, aliases: ["js"] },
	{ id: "json", label: msg`JSON` },
	{ id: "jsx", label: msg`JSX` },
	{ id: "kotlin", label: msg`Kotlin`, aliases: ["kt"] },
	{ id: "markdown", label: msg`Markdown`, aliases: ["md"] },
	{ id: "mdx", label: msg`MDX` },
	{ id: "php", label: msg`PHP` },
	{ id: "python", label: msg`Python`, aliases: ["py"] },
	{ id: "ruby", label: msg`Ruby`, aliases: ["rb"] },
	{ id: "rust", label: msg`Rust`, aliases: ["rs"] },
	{ id: "scss", label: msg`SCSS`, aliases: ["sass"] },
	{ id: "sql", label: msg`SQL` },
	{ id: "svelte", label: msg`Svelte` },
	{ id: "swift", label: msg`Swift` },
	{ id: "toml", label: msg`TOML` },
	{ id: "tsx", label: msg`TSX` },
	{ id: "typescript", label: msg`TypeScript`, aliases: ["ts"] },
	{ id: "vue", label: msg`Vue` },
	{ id: "xml", label: msg`XML` },
	{ id: "yaml", label: msg`YAML`, aliases: ["yml"] },
];

/**
 * Look up a language by id or alias. Case-insensitive.
 * Returns the canonical entry, or `null` if not found in the curated list.
 */
export function findLanguage(value: string | null | undefined): CodeBlockLanguage | null {
	if (!value) return null;
	const searchText = value.trim().toLowerCase();
	if (!searchText) return null;
	for (const lang of CODE_BLOCK_LANGUAGES) {
		if (lang.id === searchText) return lang;
		if (lang.aliases?.includes(searchText)) return lang;
	}
	return null;
}

/**
 * Normalize a user-entered language string to a canonical id where possible.
 * Unknown inputs are sanitized to a single safe class token: lowercased,
 * trimmed, with any character outside `[a-z0-9_-]` (including whitespace,
 * dots, slashes, etc.) collapsed to `-`. This keeps the stored value safe
 * to interpolate into a `language-{id}` CSS class without splitting on
 * whitespace.
 *
 * Examples:
 *   normalizeLanguage("TypeScript")   -> "typescript" (canonical id)
 *   normalizeLanguage("ts")           -> "typescript" (alias)
 *   normalizeLanguage("Objective C")  -> "objective-c" (sanitized)
 *   normalizeLanguage("F#")           -> "f-" (sanitized)
 *   normalizeLanguage("")             -> undefined
 */
// Hoisted to module scope to avoid re-compilation on every call.
const DISALLOWED_CHARS_RE = /[^a-z0-9_-]+/g;
const LEADING_TRAILING_HYPHENS_RE = /^-+|-+$/g;

export function normalizeLanguage(value: string | null | undefined): string | undefined {
	if (!value) return undefined;
	const trimmed = value.trim();
	if (!trimmed) return undefined;
	const match = findLanguage(trimmed);
	if (match) return match.id;
	// Sanitize unknown input: lowercase, then collapse runs of disallowed
	// characters into a single `-` so the result is always a single CSS class
	// token. We deliberately don't return `undefined` on collisions here --
	// callers can compare the sanitized result with the input if they need to.
	const sanitized = trimmed
		.toLowerCase()
		.replace(DISALLOWED_CHARS_RE, "-")
		.replace(LEADING_TRAILING_HYPHENS_RE, "");
	return sanitized || undefined;
}

/**
 * Human-readable label for a stored language id. Falls back to the id itself
 * for unknown values so the editor never shows "undefined".
 */
export function languageLabelDescriptor(
	value: string | null | undefined,
): MessageDescriptor | string {
	if (!value) return msg`Plain text`;
	const match = findLanguage(value);
	if (match) return match.label;
	return value;
}