import type { AiSearchWebCrawlerSource } from "./ai-search.ts";

/**
 * Parse a URL string and extract its hostname and pathname.
 *
 * Inputs without a leading `scheme://` (for example `example.com/blog`) are
 * treated as HTTPS URLs. The scheme is only looked for at the very start of
 * the string, so a scheme-less input that merely *contains* `://` later on
 * (e.g. `example.com/go?next=https://other.com`) is still prefixed correctly.
 *
 * @param url - Absolute URL, or a host/path without a scheme.
 * @returns The hostname (no port) as `domain` and the pathname as `path`.
 * @throws TypeError if the normalized string is not a valid URL.
 */
function parseUrl(url: string): { domain: string; path: string } {
  // Trim first so a padded, scheme-less input does not end up as
  // "https:// example.com", which the URL parser rejects.
  const trimmed = url.trim();

  // A scheme is a letter followed by letters, digits, "+", "-" or ".",
  // and must sit at the start of the string to count.
  const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(trimmed);
  const normalized = hasScheme ? trimmed : `https://${trimmed}`;

  const parsed = new URL(normalized);
  return {
    domain: parsed.hostname,
    path: parsed.pathname,
  };
}

/**
 * Turn a URL pathname into the glob pattern used for path filtering.
 *
 * One leading slash is dropped from the path. If nothing (or only a bare
 * slash) remains, the catch-all `**` is returned; otherwise the remainder is
 * wrapped as `**\/<path>**` so the path and anything beneath it match.
 *
 * @param path - URL pathname, with or without a leading slash.
 * @returns The glob pattern for that path.
 */
function pathToGlobPattern(path: string): string {
  const relative = path.startsWith("/") ? path.slice(1) : path;
  const isRoot = relative === "" || relative === "/";

  return isRoot ? "**" : `**/${relative}**`;
}

/**
 * Builds an AiSearchWebCrawlerSource configuration from URLs.
 *
 * This is a convenience helper that parses URLs and extracts the domain
 * and path filters for use with AiSearch.
 *
 * Path filtering rules:
 * - If any URL points at the domain root, the whole domain is crawled and
 *   `includePaths` is left undefined. A root URL is never silently dropped
 *   in favour of narrower sibling paths.
 * - Otherwise every URL path becomes one include pattern; duplicate
 *   patterns are emitted only once.
 *
 * @example
 * // Crawl an entire domain
 * const search = await AiSearch("docs-search", {
 *   source: AiCrawler(["https://docs.example.com"]),
 * });
 *
 * @example
 * // Crawl specific paths on a domain
 * const search = await AiSearch("blog-search", {
 *   source: AiCrawler([
 *     "https://example.com/blog",
 *     "https://example.com/news",
 *   ]),
 * });
 *
 * @param urls - URLs to crawl. All URLs must be from the same domain.
 * @returns An AiSearchWebCrawlerSource configuration object
 * @throws Error if `urls` is empty or the URLs span more than one domain.
 */
export function AiCrawler(urls: string[]): AiSearchWebCrawlerSource {
  if (!urls || urls.length === 0) {
    throw new Error("AiCrawler requires at least one URL");
  }

  // Parse all URLs
  const parsed = urls.map(parseUrl);

  // Verify all URLs are from the same domain
  const domains = new Set(parsed.map((p) => p.domain));
  if (domains.size > 1) {
    throw new Error(
      `All URLs must be from the same domain. Found: ${[...domains].join(", ")}`,
    );
  }

  const domain = parsed[0].domain;

  // A root URL means "crawl everything", so any path filter derived from the
  // other URLs would wrongly narrow the crawl.
  const crawlsWholeDomain = parsed.some((p) => !p.path || p.path === "/");

  // Build include patterns from the URL paths; the Set removes duplicates
  // while keeping first-seen order.
  const includePaths = crawlsWholeDomain
    ? undefined
    : [...new Set(parsed.map((p) => pathToGlobPattern(p.path)))];

  return {
    type: "web-crawler",
    domain,
    includePaths,
  };
}