/** * SEO Handlers * * Business logic for sitemap generation and robots.txt. */ import { sql, type Kysely } from "kysely"; import type { Database } from "../../database/types.js"; import { validateIdentifier } from "../../database/validate.js"; import type { ApiResult } from "../types.js"; /** Raw content data for sitemap generation — the route builds the actual URLs */ export interface SitemapContentEntry { /** Content ID (ULID) */ id: string; /** Content slug, or null when the entry has no slug */ slug: string | null; /** ISO date of last modification */ updatedAt: string; /** * Locale of this row (e.g. `"en"`, `"fr"`). Always present — rows in * pre-i18n databases are backfilled to the configured `defaultLocale`. */ locale: string; /** * `translation_group` ULID shared across all locale variants of the * same content. Used by the sitemap route to emit `hreflang` * alternates between siblings. */ translationGroup: string | null; /** * Stored SEO image reference (`_emdash_seo.seo_image`), or null when * the entry has no SEO image. The route resolves it to an absolute * URL and emits it as an `` sitemap entry. */ image: string | null; } /** Per-collection sitemap data with entries and URL pattern */ export interface SitemapCollectionData { /** Collection slug (e.g., "post", "page") */ collection: string; /** URL pattern with {slug} placeholder, or null for default /{collection}/{slug} */ urlPattern: string | null; /** Most recent updated_at across all entries (for sitemap index lastmod) */ lastmod: string; /** Individual content entries */ entries: SitemapContentEntry[]; } export interface SitemapDataResponse { collections: SitemapCollectionData[]; } /** Maximum entries per sitemap (per spec) */ const SITEMAP_MAX_ENTRIES = 50_000; /** Matches a trailing timezone designator (`Z` or `±HH`, `±HHMM`, `±HH:MM`). */ const TZ_SUFFIX_RE = /([zZ]|[+-]\d{2}(:?\d{2})?)$/; /** * Normalize a stored timestamp to W3C Datetime (ISO 8601) for sitemaps. * * `updated_at` is not guaranteed to be ISO: the column default is * `datetime('now')` on SQLite and `CURRENT_TIMESTAMP` on Postgres, both of * which store a space-separated `YYYY-MM-DD HH:MM:SS` string (and imported * content can carry other shapes). The sitemap `` field requires * W3C Datetime, and Google Search Console rejects the space-separated form * as "Invalid date". Normalize defensively, assuming UTC when no offset is * present (matches SQLite's `datetime('now')`). Valid date strings are * normalized to UTC ISO 8601; unparseable values are returned as-is. */ function toW3CDate(value: string): string { if (!value) return value; let normalized = value.trim(); if (normalized.includes(" ") && !normalized.includes("T")) { normalized = normalized.replace(" ", "T"); } if (!TZ_SUFFIX_RE.test(normalized)) { normalized += "Z"; } const parsed = Date.parse(normalized); return Number.isNaN(parsed) ? value : new Date(parsed).toISOString(); } /** * Collect all published, indexable content across SEO-enabled collections * for sitemap generation, grouped by collection. * * Only includes content from collections with `has_seo = 1`. * Excludes content with `seo_no_index = 1` in the `_emdash_seo` table. * * Returns raw data grouped per collection. The caller (route) is * responsible for building absolute URLs — this handler does NOT * assume a URL structure. */ export async function handleSitemapData( db: Kysely, /** When set, only return data for this collection. */ collectionSlug?: string, ): Promise> { try { // Find SEO-enabled collections (optionally filtered) let query = db .selectFrom("_emdash_collections") .select(["slug", "url_pattern"]) .where("has_seo", "=", 1); if (collectionSlug) { query = query.where("slug", "=", collectionSlug); } const collections = await query.execute(); const result: SitemapCollectionData[] = []; for (const col of collections) { // Validate the slug before using it as a table name identifier. // Should always pass (slugs are validated on creation), but // guards against corrupted DB data. try { validateIdentifier(col.slug, "collection slug"); } catch { console.warn(`[SITEMAP] Skipping collection with invalid slug: ${col.slug}`); continue; } const tableName = `ec_${col.slug}`; // Query published, non-deleted content. // LEFT JOIN _emdash_seo to check noindex flag. // Content without an SEO row is assumed indexable (default). // Wrapped in try/catch so a missing/broken table doesn't fail the // entire sitemap — we skip that collection and continue. try { const rows = await sql<{ slug: string | null; id: string; updated_at: string; locale: string; translation_group: string | null; seo_image: string | null; }>` SELECT c.slug, c.id, c.updated_at, c.locale, c.translation_group, s.seo_image FROM ${sql.ref(tableName)} c LEFT JOIN _emdash_seo s ON s.collection = ${col.slug} AND s.content_id = c.id WHERE c.status = 'published' AND c.deleted_at IS NULL AND (s.seo_no_index IS NULL OR s.seo_no_index = 0) ORDER BY c.updated_at DESC LIMIT ${SITEMAP_MAX_ENTRIES} `.execute(db); if (rows.rows.length === 0) continue; const entries: SitemapContentEntry[] = []; for (const row of rows.rows) { entries.push({ id: row.id, slug: row.slug, updatedAt: toW3CDate(row.updated_at), locale: row.locale, translationGroup: row.translation_group, image: row.seo_image ?? null, }); } result.push({ collection: col.slug, urlPattern: col.url_pattern, // Rows are ordered by updated_at DESC, so first row is the latest lastmod: toW3CDate(rows.rows[0].updated_at), entries, }); } catch (err) { // Table missing or query error — skip this collection console.warn(`[SITEMAP] Failed to query collection "${col.slug}":`, err); continue; } } return { success: true, data: { collections: result } }; } catch (error) { console.error("[SITEMAP_ERROR]", error); return { success: false, error: { code: "SITEMAP_ERROR", message: "Failed to generate sitemap data" }, }; } }