/**
 * Parse the plain-text day-by-day format emitted by the `trip-itinerary`
 * AI task into the shape TripForm's `itinerary_days` state expects.
 *
 * The LLM is instructed (in the prompt template) to emit:
 *
 *   Day 1: Arrive in Kathmandu
 *   Welcome briefing at the hotel, group dinner, and an early night.
 *
 *   Day 2: Fly to Lukla, trek to Phakding
 *   ...
 *
 * In practice models sometimes drift — they may emit "Day 1 -" instead of
 * "Day 1:", wrap with markdown bold, or sneak in a leading preamble. The
 * parser tolerates these by:
 *   - matching `Day N` with a flexible, optional separator (`:`, `-`, `–`, `—`)
 *   - stripping leading/trailing whitespace + common markdown chars
 *   - dropping anything before the first valid `Day N` block
 *   - collapsing runs of blank lines inside descriptions
 */

/** One parsed itinerary day: 1-based position, header title, body text. */
export interface ParsedDay {
  day: number;
  day_title: string;
  description: string;
}

/**
 * Match a "Day N: title" line, tolerant of LLM drift:
 *
 *   - optional leading `#`s (markdown header)
 *   - optional leading `*`s (markdown bold)
 *   - "Day" / "day" / "DAY"
 *   - optional `*`s right after the number, so `**Day 1**: Title` does not
 *     leak the separator into the title
 *   - optional `:`, `-`, `–` (en dash) or `—` (em dash) separator
 *   - optional trailing `*`s
 *
 * Capture group 1 is the raw title (possibly empty). The day number is
 * matched but not captured: days are numbered by position, not by label.
 *
 * NOTE: every asterisk group must stay ZERO-or-more. With a one-or-more
 * trailing group, a plain "Day 1: Arrive in Kathmandu" with no markdown
 * decoration is rejected and no days are parsed at all.
 */
const DAY_HEAD =
  /^\s*(?:#+\s*)?(?:\*+\s*)?day\s+\d+\s*\**\s*(?:[:\-–—]\s*)?(.*?)\s*\**\s*$/i;

/**
 * Split LLM itinerary text into day entries.
 *
 * @param text Raw model output; any of `\r\n`, `\r`, `\n` line endings.
 * @returns One entry per `Day N` header, in the order they appear. `day` is
 *   always the 1-based position in the result, so skipped or duplicated
 *   numbers in the input are renumbered. Empty input, or input without any
 *   header line, yields `[]`.
 */
export function parseItineraryText(text: string): ParsedDay[] {
  if (!text) return [];

  const days: ParsedDay[] = [];
  // Title of the day currently being collected; null until the first header.
  let openTitle: string | null = null;
  let bodyLines: string[] = [];

  const closeOpenDay = (): void => {
    if (openTitle !== null) {
      days.push({
        // Positional numbering: callers care about order, not the labels
        // the model happened to print.
        day: days.length + 1,
        day_title: openTitle,
        description: cleanDescription(bodyLines),
      });
    }
    openTitle = null;
    bodyLines = [];
  };

  for (const rawLine of text.split(/\r\n|\r|\n/)) {
    const line = rawLine.trimEnd();
    const head = DAY_HEAD.exec(line);
    if (head) {
      closeOpenDay();
      // Drop every markdown emphasis/code marker from the title.
      openTitle = (head[1] ?? "").replace(/[*_`]+/g, "").trim();
      continue;
    }
    // Lines before the first valid "Day N" header are intentionally dropped.
    if (openTitle !== null) bodyLines.push(line);
  }
  closeOpenDay();

  return days;
}

/**
 * Turn the raw lines collected under one day header into display text.
 *
 * Removes markdown emphasis markers while keeping the emphasized text,
 * leaves `* item` bullets and in-word underscores (`snake_case`) alone,
 * collapses runs of blank lines to a single blank line, and trims the
 * result.
 */
function cleanDescription(buffer: string[]): string {
  return (
    buffer
      .join("\n")
      // **bold** / __bold__ anywhere in a line -> bold
      .replace(/(\*\*|__)(?=\S)(.+?)(?<=\S)\1/g, "$2")
      // *italic* / _italic_ -> italic; the lookarounds skip bullets
      // ("* item"), arithmetic ("2 * 3") and in-word underscores.
      .replace(/(?<![\w*])([*_])(?=\S)(.+?)(?<=\S)\1(?![\w*])/g, "$2")
      // Lines made only of markers (e.g. a "***" rule) become blank.
      .replace(/^[*_]+$/gm, "")
      // Dangling, unpaired markers glued to the first/last word of a line.
      .replace(/^[*_]+(?=[^\s*_])/gm, "")
      .replace(/(?<=[^\s*_])[*_]+$/gm, "")
      // Collapse last, so blank lines created above are folded in too.
      .replace(/\n{3,}/g, "\n\n")
      .trim()
  );
}