{"version":3,"file":"tokenizer.cjs","sources":["../../../src/utils/tokenizer.ts"],"sourcesContent":["/**\n * Centralized tokenization utilities for consistent word boundary handling\n * \n * This module provides a single source of truth for how text is split into tokens,\n * ensuring consistent behavior across indexing, search, and phrase matching.\n */\n\n/**\n * Word boundary characters - these separate tokens\n * Includes: whitespace, hyphens, underscores, punctuation, brackets, quotes, slashes\n */\nexport const WORD_BOUNDARY_CHARS = /[\\s\\-_.,;:!?()[\\]{}'\"\\/\\\\#@$%^&*+=<>|~`]/;\n\n/**\n * Word boundary pattern for splitting text into tokens\n */\nexport const WORD_BOUNDARY_PATTERN = /[\\s\\-_.,;:!?()[\\]{}'\"\\/\\\\#@$%^&*+=<>|~`]+/;\n\n/**\n * Tokenize text into words by splitting on word boundaries\n * \n * This is the core tokenization function used throughout the library.\n * It splits on common delimiters while preserving alphanumeric content.\n * \n * @param text - Text to tokenize\n * @param options - Tokenization options\n * @returns Array of tokens\n * \n * @example\n * ```typescript\n * tokenize(\"api_manager_3254\") // [\"api\", \"manager\", \"3254\"]\n * tokenize(\"hello-world\") // [\"hello\", \"world\"]\n * tokenize(\"user@email.com\") // [\"user\", \"email\", \"com\"]\n * tokenize(\"snake_case_var\") // [\"snake\", \"case\", \"var\"]\n * ```\n */\nexport function tokenize(\n  text: string,\n  options: {\n    /** Keep empty tokens (default: false) */\n    keepEmpty?: boolean;\n    /** Minimum token length (default: 0) */\n    minLength?: number;\n    /** Convert to lowercase (default: false) */\n    lowercase?: boolean;\n  } = {}\n): string[] {\n  const { keepEmpty = false, minLength = 0, lowercase = false } = options;\n\n  // Split on word boundaries\n  let tokens = text.split(WORD_BOUNDARY_PATTERN);\n\n  // Filter empty tokens unless explicitly kept\n  if (!keepEmpty) {\n    tokens = tokens.filter((token) => token.length > 0);\n  }\n\n  // Apply minimum length filter\n  if (minLength > 0) {\n    tokens = tokens.filter((token) => token.length >= minLength);\n  }\n\n  // Apply lowercase transformation\n  if (lowercase) {\n    tokens = tokens.map((token) => token.toLowerCase());\n  }\n\n  return tokens;\n}\n\n/**\n * Check if a character is a word boundary\n * \n * @param char - Character to check\n * @returns True if the character is a word boundary\n */\nexport function isWordBoundaryChar(char: string): boolean {\n  return WORD_BOUNDARY_CHARS.test(char);\n}\n\n/**\n * Tokenize and also return the original text with tokens\n * Useful for highlighting and position tracking\n * \n * @param text - Text to tokenize\n * @returns Object with tokens and original text\n */\nexport function tokenizeWithPositions(text: string): {\n  tokens: string[];\n  positions: { token: string; start: number; end: number }[];\n} {\n  const tokens: string[] = [];\n  const positions: { token: string; start: number; end: number }[] = [];\n\n  let currentToken = \"\";\n  let tokenStart = 0;\n\n  for (let i = 0; i < text.length; i++) {\n    const char = text[i];\n\n    if (isWordBoundaryChar(char)) {\n      // End of token\n      if (currentToken.length > 0) {\n        tokens.push(currentToken);\n        positions.push({\n          token: currentToken,\n          start: tokenStart,\n          end: i,\n        });\n        currentToken = \"\";\n      }\n      tokenStart = i + 1;\n    } else {\n      // Part of token\n      if (currentToken.length === 0) {\n        tokenStart = i;\n      }\n      currentToken += char;\n    }\n  }\n\n  // Add final token if exists\n  if (currentToken.length > 0) {\n    tokens.push(currentToken);\n    positions.push({\n      token: currentToken,\n      start: tokenStart,\n      end: text.length,\n    });\n  }\n\n  return { tokens, positions };\n}\n\n/**\n * Join tokens back into text with a separator\n * \n * @param tokens - Tokens to join\n * @param separator - Separator to use (default: space)\n * @returns Joined text\n */\nexport function joinTokens(tokens: string[], separator: string = \" \"): string {\n  return tokens.join(separator);\n}\n\n/**\n * Normalize text for search by tokenizing and rejoining\n * This ensures consistent handling of special characters\n * \n * @param text - Text to normalize\n * @param options - Normalization options\n * @returns Normalized text\n */\nexport function normalizeForSearch(\n  text: string,\n  options: {\n    lowercase?: boolean;\n    separator?: string;\n  } = {}\n): string {\n  const { lowercase = true, separator = \" \" } = options;\n  const tokens = tokenize(text, { lowercase });\n  return joinTokens(tokens, separator);\n}\n"],"names":[],"mappings":";;AAgBO,MAAM,wBAAwB;AAoB9B,SAAS,SACd,MACA,UAOI,IACM;AACV,QAAM,EAAE,YAAY,OAAO,YAAY,GAAG,YAAY,UAAU;AAGhE,MAAI,SAAS,KAAK,MAAM,qBAAqB;AAG7C,MAAI,CAAC,WAAW;AACd,aAAS,OAAO,OAAO,CAAC,UAAU,MAAM,SAAS,CAAC;AAAA,EACpD;AAGA,MAAI,YAAY,GAAG;AACjB,aAAS,OAAO,OAAO,CAAC,UAAU,MAAM,UAAU,SAAS;AAAA,EAC7D;AAGA,MAAI,WAAW;AACb,aAAS,OAAO,IAAI,CAAC,UAAU,MAAM,aAAa;AAAA,EACpD;AAEA,SAAO;AACT;;;"}