{"version":3,"file":"data-indexer.cjs","sources":["../../../src/utils/data-indexer.ts"],"sourcesContent":["/**\n * Data Indexer Utility\n * Extract unique words from various data formats for fuzzy search indexing\n */\n\nimport { tokenize } from \"./tokenizer.js\";\n\nexport interface DataToIndexOptions {\n  /** Minimum word length to include (default: 2) */\n  minLength?: number;\n  /** Split text into words (default: true) */\n  splitWords?: boolean;\n  /** Remove stop words (default: false) */\n  stopWords?: string[] | false;\n  /** Overlap between chunks in characters (default: 0) */\n  overlap?: number;\n  /** Size of each chunk in characters (default: 0 = no chunking) */\n  chunkSize?: number;\n  /** Split strategy for chunking (default: 'word') */\n  splitOn?: \"word\" | \"sentence\" | \"paragraph\";\n  /** Data format (default: 'string') */\n  format?: \"string\" | \"html\" | \"json\" | \"base64\" | \"url\";\n  /** Remove numbers (default: false) */\n  removeNumbers?: boolean;\n  /** Case sensitive (default: false) */\n  caseSensitive?: boolean;\n}\n\n/**\n * Extract unique words from various data formats\n * Returns an array of unique words that can be used as a dictionary for fuzzy search\n *\n * @param content - The content to extract words from\n * @param options - Configuration options\n * @returns Array of unique words (no duplicates)\n *\n * @example\n * // Simple text\n * const words = dataToIndex(\"Hello world! Hello again.\");\n * // → ['hello', 'world', 'again']\n *\n * @example\n * // HTML content\n * const words = dataToIndex(\"<h1>Title</h1><p>Content here</p>\", { format: 'html' });\n * // → ['title', 'content', 'here']\n *\n * @example\n * // JSON data\n * const data = [{ name: \"John\", city: \"NYC\" }, { name: \"Jane\", city: \"LA\" }];\n * const words = dataToIndex(JSON.stringify(data), { format: 'json' });\n * // → ['john', 'nyc', 'jane', 'la']\n */\nexport function dataToIndex(\n  //\n  content: string,\n  options: DataToIndexOptions = {}\n): string[] {\n  const {\n    //\n    minLength = 2,\n    splitWords = true,\n    stopWords = false,\n    overlap = 0,\n    chunkSize = 0,\n    splitOn = \"word\",\n    format = \"string\",\n    removeNumbers = false,\n    caseSensitive = false,\n  } = options;\n\n  let text = content;\n\n  // Step 1: Handle different formats\n  switch (format) {\n    case \"base64\":\n      try {\n        text = atob(content);\n      } catch (e) {\n        console.error(\"Failed to decode base64:\", e);\n        return [];\n      }\n      break;\n\n    case \"html\":\n      text = stripHTML(content);\n      break;\n\n    case \"json\":\n      text = extractFromJSON(content);\n      break;\n\n    case \"url\":\n      // URL format requires async, so we'll throw an error\n      throw new Error(\"URL format requires async. Use dataToIndexAsync() instead.\");\n\n    case \"string\":\n    default:\n      // Already a string, no conversion needed\n      break;\n  }\n\n  // Step 2: Apply chunking if specified\n  if (chunkSize > 0) {\n    const chunks = chunkText(text, chunkSize, overlap, splitOn);\n    text = chunks.join(\" \");\n  }\n\n  // Step 3: Extract words\n  let words: string[] = [];\n\n  if (splitWords) {\n    // Use centralized tokenizer for consistent word boundary handling\n    words = tokenize(text, { keepEmpty: false });\n  } else {\n    words = [text];\n  }\n\n  // Step 4: Clean and filter words\n  words = words\n    .map((word) => {\n      // Remove leading/trailing punctuation (but preserve unicode letters)\n      word = word.replace(/^[^\\p{L}\\p{N}]+|[^\\p{L}\\p{N}]+$/gu, \"\");\n\n      // Convert case\n      if (!caseSensitive) {\n        word = word.toLowerCase();\n      }\n\n      return word;\n    })\n    .filter((word) => {\n      // Filter by minimum length\n      if (word.length < minLength) return false;\n\n      // Filter numbers if requested\n      if (removeNumbers && /^\\d+$/.test(word)) return false;\n\n      return true;\n    });\n\n  // Step 5: Remove stop words if specified\n  if (stopWords && Array.isArray(stopWords)) {\n    const stopWordsSet = new Set(stopWords.map((w) => w.toLowerCase()));\n    words = words.filter((word) => !stopWordsSet.has(word.toLowerCase()));\n  }\n\n  // Step 6: Remove duplicates and return\n  return Array.from(new Set(words));\n}\n\n/**\n * Strip HTML tags and extract text content\n */\nfunction stripHTML(html: string): string {\n  // Remove script and style tags with their content\n  let text = html.replace(/<script\\b[^<]*(?:(?!<\\/script>)<[^<]*)*<\\/script>/gi, \" \");\n  text = text.replace(/<style\\b[^<]*(?:(?!<\\/style>)<[^<]*)*<\\/style>/gi, \" \");\n\n  // Remove HTML comments\n  text = text.replace(/<!--[\\s\\S]*?-->/g, \" \");\n\n  // Remove all HTML tags\n  text = text.replace(/<[^>]+>/g, \" \");\n\n  // Decode common HTML entities\n  text = text\n    .replace(/&nbsp;/g, \" \")\n    .replace(/&amp;/g, \"&\")\n    .replace(/&lt;/g, \"<\")\n    .replace(/&gt;/g, \">\")\n    .replace(/&quot;/g, '\"')\n    .replace(/&#39;/g, \"'\")\n    .replace(/&apos;/g, \"'\");\n\n  // Normalize whitespace\n  text = text.replace(/\\s+/g, \" \").trim();\n\n  return text;\n}\n\n/**\n * Extract string values from JSON\n */\nfunction extractFromJSON(jsonString: string): string {\n  try {\n    const data = JSON.parse(jsonString);\n    const values: string[] = [];\n\n    function extractValues(obj: any, depth: number = 0): void {\n      // Limit recursion depth to prevent stack overflow\n      if (depth > 10) return;\n\n      if (typeof obj === \"string\") {\n        values.push(obj);\n      } else if (Array.isArray(obj)) {\n        obj.forEach((item) => extractValues(item, depth + 1));\n      } else if (typeof obj === \"object\" && obj !== null) {\n        Object.values(obj).forEach((value) => extractValues(value, depth + 1));\n      }\n    }\n\n    extractValues(data);\n    return values.join(\" \");\n  } catch (e) {\n    console.error(\"Failed to parse JSON:\", e);\n    return \"\";\n  }\n}\n\n/**\n * Chunk text into smaller pieces\n */\nfunction chunkText(\n  //\n  text: string,\n  chunkSize: number,\n  overlap: number,\n  splitOn: \"word\" | \"sentence\" | \"paragraph\"\n): string[] {\n  const chunks: string[] = [];\n\n  if (splitOn === \"paragraph\") {\n    // Split on double newlines\n    const paragraphs = text.split(/\\n\\n+/);\n    let currentChunk = \"\";\n\n    for (const para of paragraphs) {\n      if ((currentChunk + para).length <= chunkSize) {\n        currentChunk += (currentChunk ? \"\\n\\n\" : \"\") + para;\n      } else {\n        if (currentChunk) chunks.push(currentChunk);\n        currentChunk = para;\n      }\n    }\n    if (currentChunk) chunks.push(currentChunk);\n  } else if (splitOn === \"sentence\") {\n    // Split on sentence boundaries\n    const sentences = text.split(/[.!?]+\\s+/);\n    let currentChunk = \"\";\n\n    for (const sentence of sentences) {\n      if ((currentChunk + sentence).length <= chunkSize) {\n        currentChunk += (currentChunk ? \" \" : \"\") + sentence;\n      } else {\n        if (currentChunk) chunks.push(currentChunk);\n        currentChunk = sentence;\n      }\n    }\n    if (currentChunk) chunks.push(currentChunk);\n  } else {\n    // Split on words (default)\n    const words = text.split(/\\s+/);\n    let currentChunk = \"\";\n\n    for (const word of words) {\n      if ((currentChunk + \" \" + word).length <= chunkSize) {\n        currentChunk += (currentChunk ? \" \" : \"\") + word;\n      } else {\n        if (currentChunk) chunks.push(currentChunk);\n\n        // Add overlap\n        if (overlap > 0 && currentChunk) {\n          const overlapWords = currentChunk.split(/\\s+/).slice(-Math.ceil(overlap / 10));\n          currentChunk = overlapWords.join(\" \") + \" \" + word;\n        } else {\n          currentChunk = word;\n        }\n      }\n    }\n    if (currentChunk) chunks.push(currentChunk);\n  }\n\n  return chunks;\n}\n\n/**\n * Async version for URL fetching\n * @param content - URL or content string\n * @param options - Configuration options\n * @returns Promise<string[]> Array of unique words\n */\nexport async function dataToIndexAsync(\n  //\n  content: string,\n  options: DataToIndexOptions = {}\n): Promise<string[]> {\n  const { format = \"string\" } = options;\n\n  if (format === \"url\") {\n    try {\n      const response = await fetch(content);\n      const html = await response.text();\n      return dataToIndex(html, { ...options, format: \"html\" });\n    } catch (e) {\n      console.error(\"Failed to fetch URL:\", e);\n      return [];\n    }\n  }\n\n  return dataToIndex(content, options);\n}\n"],"names":["tokenize"],"mappings":";;;AAoDO,SAAS,YAEd,SACA,UAA8B,IACpB;AACV,QAAM;AAAA;AAAA,IAEJ,YAAY;AAAA,IACZ,aAAa;AAAA,IACb,YAAY;AAAA,IACZ,UAAU;AAAA,IACV,YAAY;AAAA,IACZ,UAAU;AAAA,IACV,SAAS;AAAA,IACT,gBAAgB;AAAA,IAChB,gBAAgB;AAAA,EAAA,IACd;AAEJ,MAAI,OAAO;AAGX,UAAQ,QAAA;AAAA,IACN,KAAK;AACH,UAAI;AACF,eAAO,KAAK,OAAO;AAAA,MACrB,SAAS,GAAG;AACV,gBAAQ,MAAM,4BAA4B,CAAC;AAC3C,eAAO,CAAA;AAAA,MACT;AACA;AAAA,IAEF,KAAK;AACH,aAAO,UAAU,OAAO;AACxB;AAAA,IAEF,KAAK;AACH,aAAO,gBAAgB,OAAO;AAC9B;AAAA,IAEF,KAAK;AAEH,YAAM,IAAI,MAAM,4DAA4D;AAAA,EAK5E;AAIJ,MAAI,YAAY,GAAG;AACjB,UAAM,SAAS,UAAU,MAAM,WAAW,SAAS,OAAO;AAC1D,WAAO,OAAO,KAAK,GAAG;AAAA,EACxB;AAGA,MAAI,QAAkB,CAAA;AAEtB,MAAI,YAAY;AAEd,YAAQA,UAAAA,SAAS,MAAM,EAAE,WAAW,OAAO;AAAA,EAC7C,OAAO;AACL,YAAQ,CAAC,IAAI;AAAA,EACf;AAGA,UAAQ,MACL,IAAI,CAAC,SAAS;AAEb,WAAO,KAAK,QAAQ,qCAAqC,EAAE;AAG3D,QAAI,CAAC,eAAe;AAClB,aAAO,KAAK,YAAA;AAAA,IACd;AAEA,WAAO;AAAA,EACT,CAAC,EACA,OAAO,CAAC,SAAS;AAEhB,QAAI,KAAK,SAAS,UAAW,QAAO;AAGpC,QAAI,iBAAiB,QAAQ,KAAK,IAAI,EAAG,QAAO;AAEhD,WAAO;AAAA,EACT,CAAC;AAGH,MAAI,aAAa,MAAM,QAAQ,SAAS,GAAG;AACzC,UAAM,eAAe,IAAI,IAAI,UAAU,IAAI,CAAC,MAAM,EAAE,YAAA,CAAa,CAAC;AAClE,YAAQ,MAAM,OAAO,CAAC,SAAS,CAAC,aAAa,IAAI,KAAK,YAAA,CAAa,CAAC;AAAA,EACtE;AAGA,SAAO,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAClC;AAKA,SAAS,UAAU,MAAsB;AAEvC,MAAI,OAAO,KAAK,QAAQ,uDAAuD,GAAG;AAClF,SAAO,KAAK,QAAQ,oDAAoD,GAAG;AAG3E,SAAO,KAAK,QAAQ,oBAAoB,GAAG;AAG3C,SAAO,KAAK,QAAQ,YAAY,GAAG;AAGnC,SAAO,KACJ,QAAQ,WAAW,GAAG,EACtB,QAAQ,UAAU,GAAG,EACrB,QAAQ,SAAS,GAAG,EACpB,QAAQ,SAAS,GAAG,EACpB,QAAQ,WAAW,GAAG,EACtB,QAAQ,UAAU,GAAG,EACrB,QAAQ,WAAW,GAAG;AAGzB,SAAO,KAAK,QAAQ,QAAQ,GAAG,EAAE,KAAA;AAEjC,SAAO;AACT;AAKA,SAAS,gBAAgB,YAA4B;AACnD,MAAI;AAIF,QAAS,gBAAT,SAAuB,KAAU,QAAgB,GAAS;AAExD,UAAI,QAAQ,GAAI;AAEhB,UAAI,OAAO,QAAQ,UAAU;AAC3B,eAAO,KAAK,GAAG;AAAA,MACjB,WAAW,MAAM,QAAQ,GAAG,GAAG;AAC7B,YAAI,QAAQ,CAAC,SAAS,cAAc,MAAM,QAAQ,CAAC,CAAC;AAAA,MACtD,WAAW,OAAO,QAAQ,YAAY,QAAQ,MAAM;AAClD,eAAO,OAAO,GAAG,EAAE,QAAQ,CAAC,UAAU,cAAc,OAAO,QAAQ,CAAC,CAAC;AAAA,MACvE;AAAA,IACF;AAdA,UAAM,OAAO,KAAK,MAAM,UAAU;AAClC,UAAM,SAAmB,CAAA;AAezB,kBAAc,IAAI;AAClB,WAAO,OAAO,KAAK,GAAG;AAAA,EACxB,SAAS,GAAG;AACV,YAAQ,MAAM,yBAAyB,CAAC;AACxC,WAAO;AAAA,EACT;AACF;AAKA,SAAS,UAEP,MACA,WACA,SACA,SACU;AACV,QAAM,SAAmB,CAAA;AAEzB,MAAI,YAAY,aAAa;AAE3B,UAAM,aAAa,KAAK,MAAM,OAAO;AACrC,QAAI,eAAe;AAEnB,eAAW,QAAQ,YAAY;AAC7B,WAAK,eAAe,MAAM,UAAU,WAAW;AAC7C,yBAAiB,eAAe,SAAS,MAAM;AAAA,MACjD,OAAO;AACL,YAAI,aAAc,QAAO,KAAK,YAAY;AAC1C,uBAAe;AAAA,MACjB;AAAA,IACF;AACA,QAAI,aAAc,QAAO,KAAK,YAAY;AAAA,EAC5C,WAAW,YAAY,YAAY;AAEjC,UAAM,YAAY,KAAK,MAAM,WAAW;AACxC,QAAI,eAAe;AAEnB,eAAW,YAAY,WAAW;AAChC,WAAK,eAAe,UAAU,UAAU,WAAW;AACjD,yBAAiB,eAAe,MAAM,MAAM;AAAA,MAC9C,OAAO;AACL,YAAI,aAAc,QAAO,KAAK,YAAY;AAC1C,uBAAe;AAAA,MACjB;AAAA,IACF;AACA,QAAI,aAAc,QAAO,KAAK,YAAY;AAAA,EAC5C,OAAO;AAEL,UAAM,QAAQ,KAAK,MAAM,KAAK;AAC9B,QAAI,eAAe;AAEnB,eAAW,QAAQ,OAAO;AACxB,WAAK,eAAe,MAAM,MAAM,UAAU,WAAW;AACnD,yBAAiB,eAAe,MAAM,MAAM;AAAA,MAC9C,OAAO;AACL,YAAI,aAAc,QAAO,KAAK,YAAY;AAG1C,YAAI,UAAU,KAAK,cAAc;AAC/B,gBAAM,eAAe,aAAa,MAAM,KAAK,EAAE,MAAM,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;AAC7E,yBAAe,aAAa,KAAK,GAAG,IAAI,MAAM;AAAA,QAChD,OAAO;AACL,yBAAe;AAAA,QACjB;AAAA,MACF;AAAA,IACF;AACA,QAAI,aAAc,QAAO,KAAK,YAAY;AAAA,EAC5C;AAEA,SAAO;AACT;AAQA,eAAsB,iBAEpB,SACA,UAA8B,IACX;AACnB,QAAM,EAAE,SAAS,SAAA,IAAa;AAE9B,MAAI,WAAW,OAAO;AACpB,QAAI;AACF,YAAM,WAAW,MAAM,MAAM,OAAO;AACpC,YAAM,OAAO,MAAM,SAAS,KAAA;AAC5B,aAAO,YAAY,MAAM,EAAE,GAAG,SAAS,QAAQ,QAAQ;AAAA,IACzD,SAAS,GAAG;AACV,cAAQ,MAAM,wBAAwB,CAAC;AACvC,aAAO,CAAA;AAAA,IACT;AAAA,EACF;AAEA,SAAO,YAAY,SAAS,OAAO;AACrC;;;"}